
Commit 2aeb176 (1 parent: 3f3508f)

misc documentation, some work on rpy2 interface. near git migration

git-svn-id: http://pandas.googlecode.com/svn/trunk@202 d5231056-7de3-11de-ac95-d976489f1ece

19 files changed: +639 −31 lines

.coveragerc (new file, +26)

# .coveragerc to control coverage.py
[run]
branch = False

[report]
# Regexes for lines to exclude from consideration
exclude_lines =
    # Have to re-enable the standard pragma
    pragma: no cover

    # Don't complain about missing debug-only code:
    def __repr__
    if self\.debug

    # Don't complain if tests don't hit defensive assertion code:
    raise AssertionError
    raise NotImplementedError

    # Don't complain if non-runnable code isn't run:
    if 0:
    if __name__ == .__main__.:

ignore_errors = False

[html]
directory = coverage_html_report
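
The [html] section determines where coverage.py writes its HTML report. As a quick illustration (not part of the commit), the same configuration can be exercised from coverage.py's Python API; this sketch assumes a release whose Coverage class accepts a config_file argument:

import coverage

# Load the .coveragerc above explicitly; the command-line tool would pick
# it up automatically from the working directory.
cov = coverage.Coverage(config_file='.coveragerc')
cov.start()

import pandas  # placeholder for the code under measurement

cov.stop()
cov.save()
cov.html_report()  # writes to coverage_html_report, per the [html] section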

LICENSE.txt renamed to LICENSE

File renamed without changes.

README.txt renamed to README.rst

File renamed without changes.

RELEASE.rst (new file, +17)

=============
Release Notes
=============

pandas 0.3.0
============

**Release date:**

**New features / modules**

**Improvements**

**API Changes**

**Bug fixes**

TODO.txt renamed to TODO.rst

File renamed without changes.

bench/alignment.py (new file, +21)

# Setup
import numpy as np
import pandas
import la

N = 1000
K = 50
arr1 = np.random.randn(N, K)
arr2 = np.random.randn(N, K)
idx1 = range(N)
idx2 = range(K)

# pandas
dma1 = pandas.DataMatrix(arr1, idx1, idx2)
dma2 = pandas.DataMatrix(arr2, idx1[::-1], idx2[::-1])

# larry
lar1 = la.larry(arr1, [idx1, idx2])
lar2 = la.larry(arr2, [idx1[::-1], idx2[::-1]])

for i in range(100):
    result = lar1 + lar2
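
Note that the timing loop above only exercises larry's aligned addition; the pandas DataMatrix objects are built but never timed. A companion sketch for comparing both sides, assuming the 0.3-era DataMatrix supports + with index alignment (it reuses the objects defined above):

import time

def time_op(op, n=100):
    # Average wall time of op() over n calls (Python 2 style, as in this repo)
    start = time.clock()
    for _ in xrange(n):
        op()
    return (time.clock() - start) / n

print 'larry  %.6f s' % time_op(lambda: lar1 + lar2)
print 'pandas %.6f s' % time_op(lambda: dma1 + dma2)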

bench/serialize.py (new file, +80)

import time, os
import numpy as np

import la
import pandas

def timeit(f, iterations):
    start = time.clock()

    for i in xrange(iterations):
        f()

    return time.clock() - start

def roundtrip_archive(N, iterations=10):

    # Create data
    arr = np.random.randn(N, N)
    lar = la.larry(arr)
    dma = pandas.DataMatrix(arr, range(N), range(N))

    # filenames
    filename_numpy = '/Users/wesm/tmp/numpy.npz'
    filename_larry = '/Users/wesm/tmp/archive.hdf5'
    filename_pandas = '/Users/wesm/tmp/pandas_tmp'

    # Delete old files
    try:
        os.unlink(filename_numpy)
    except:
        pass
    try:
        os.unlink(filename_larry)
    except:
        pass
    try:
        os.unlink(filename_pandas)
    except:
        pass

    # Time a round trip save and load
    numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr)
    numpy_time = timeit(numpy_f, iterations) / iterations

    larry_f = lambda: larry_roundtrip(filename_larry, lar, lar)
    larry_time = timeit(larry_f, iterations) / iterations

    pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma)
    pandas_time = timeit(pandas_f, iterations) / iterations

    print 'Numpy (npz)   %7.4f seconds' % numpy_time
    print 'larry (HDF5)  %7.4f seconds' % larry_time
    print 'pandas (HDF5) %7.4f seconds' % pandas_time

def numpy_roundtrip(filename, arr1, arr2):
    np.savez(filename, arr1=arr1, arr2=arr2)
    npz = np.load(filename)
    arr1 = npz['arr1']
    arr2 = npz['arr2']

def larry_roundtrip(filename, lar1, lar2):
    io = la.IO(filename)
    io['lar1'] = lar1
    io['lar2'] = lar2
    lar1 = io['lar1']
    lar2 = io['lar2']

def pandas_roundtrip(filename, dma1, dma2):
    from pandas.io.pytables import HDFStore
    store = HDFStore(filename)
    store['dma1'] = dma1
    store['dma2'] = dma2
    dma1 = store['dma1']
    dma2 = store['dma2']

def pandas_roundtrip_pickle(filename, dma1, dma2):
    dma1.save(filename)
    dma1 = pandas.DataMatrix.load(filename)
    dma2.save(filename)
    dma2 = pandas.DataMatrix.load(filename)
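
roundtrip_archive is defined but never invoked in this file. A possible driver, not part of the commit (note the hard-coded /Users/wesm/tmp paths above, which would need adjusting on another machine):

if __name__ == '__main__':
    # Compare serialization round-trip times at a few matrix sizes
    for N in (100, 500, 1000):
        print 'N = %d' % N
        roundtrip_archive(N, iterations=10)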

bench/test.py (new file, +65)

import numpy as np
import itertools
import collections
import scipy.ndimage as ndi

N = 10000

lat = np.random.randint(0, 360, N)
lon = np.random.randint(0, 360, N)
data = np.random.randn(N)

def groupby1(lat, lon, data):
    indexer = np.lexsort((lon, lat))
    lat = lat.take(indexer)
    lon = lon.take(indexer)
    sorted_data = data.take(indexer)

    keys = 1000. * lat + lon
    unique_keys = np.unique(keys)
    bounds = keys.searchsorted(unique_keys)

    result = group_agg(sorted_data, bounds, lambda x: x.mean())

    decoder = keys.searchsorted(unique_keys)

    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))

def group_mean(lat, lon, data):
    indexer = np.lexsort((lon, lat))
    lat = lat.take(indexer)
    lon = lon.take(indexer)
    sorted_data = data.take(indexer)

    keys = 1000 * lat + lon
    unique_keys = np.unique(keys)

    result = ndi.mean(sorted_data, labels=keys, index=unique_keys)
    decoder = keys.searchsorted(unique_keys)

    return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result))

def group_mean_naive(lat, lon, data):
    grouped = collections.defaultdict(list)
    for lt, ln, da in zip(lat, lon, data):
        grouped[(lt, ln)].append(da)

    averaged = dict((ltln, np.mean(da)) for ltln, da in grouped.items())

    return averaged

def group_agg(values, bounds, f):
    N = len(values)
    result = np.empty(len(bounds), dtype=float)
    for i, left_bound in enumerate(bounds):
        if i == len(bounds) - 1:
            right_bound = N
        else:
            right_bound = bounds[i + 1]

        result[i] = f(values[left_bound : right_bound])

    return result

# for i in range(10):
#     groupby1(lat, lon, data)
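
All three implementations should agree on the per-(lat, lon) means; a quick consistency check along these lines (a sketch, not in the commit) confirms that groupby1's sort/searchsorted bounds, ndimage's labeled means, and the naive dict-of-lists approach compute the same thing:

# Sanity check: the three grouping strategies should give identical results
r1 = groupby1(lat, lon, data)
r2 = group_mean(lat, lon, data)
r3 = group_mean_naive(lat, lon, data)

assert set(r1) == set(r2) == set(r3)
for key in r3:
    assert np.allclose([r1[key], r2[key]], r3[key])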

doc/source/r_guide.rst (new file, +6)

.. currentmodule:: pandas

.. _r_guide:

pandas for R users
------------------

pandas/core/tests/test_common.py (new file, +19)

from pandas.core.common import notnull, isnull
import pandas.core.common as common

import numpy as np

def test_notnull():
    assert notnull(1.)
    assert not notnull(None)
    assert not notnull(np.NaN)
    assert not notnull(np.inf)
    assert not notnull(-np.inf)

def test_isnull():
    assert not isnull(1.)
    assert isnull(None)
    assert isnull(np.NaN)
    assert isnull(np.inf)
    assert isnull(-np.inf)

pandas/lib/src/moments.pyx (+36 −10)

@@ -1,5 +1,13 @@
 # Cython implementations of rolling sum, mean, variance, skewness,
 # other statistical moment functions
+#
+# Misc implementation notes
+# -------------------------
+#
+# - In Cython x * x is faster than x ** 2 for C types, this should be
+#   periodically revisited to see if it's still true.
+#
+# -
 
 # original C implementation by N. Devillard.
 # This code in public domain.
@@ -32,18 +40,15 @@ def kth_smallest(ndarray[double_t, ndim=1] a, int k):
         j = m
 
         while 1:
-            while a[i] < x:
-                i += 1
-            while x < a[j]:
-                j -= 1
+            while a[i] < x: i += 1
+            while x < a[j]: j -= 1
             if i <= j:
                 t = a[i]
                 a[i] = a[j]
                 a[j] = t
-                i += 1
-                j -= 1
-            if i > j:
-                break
+                i += 1; j -= 1
+
+            if i > j: break
 
         if j < k: l = i
         if k < i: m = j
@@ -158,8 +163,20 @@ def roll_mean(ndarray[double_t, ndim=1] input,
 #-------------------------------------------------------------------------------
 # Exponentially weighted moving average
 
-def ewma(ndarray[double_t, ndim=1] input,
-         int com):
+def ewma(ndarray[double_t, ndim=1] input, double_t com):
+    '''
+    Compute exponentially-weighted moving average using center-of-mass.
+
+    Parameters
+    ----------
+    input : ndarray (float64 type)
+    com : float64
+
+    Returns
+    -------
+    y : ndarray
+    '''
+
     cdef double cur, prev, neww, oldw, adj
     cdef int i
     cdef int N = len(input)
@@ -423,12 +440,21 @@ cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op):
     return output
 
 def roll_median(ndarray input, int win, int minp):
+    '''
+    O(N log(window)) implementation using skip list
+    '''
     return _roll_skiplist_op(input, win, minp, _get_median)
 
 def roll_max(ndarray input, int win, int minp):
+    '''
+    O(N log(window)) implementation using skip list
+    '''
     return _roll_skiplist_op(input, win, minp, _get_max)
 
 def roll_min(ndarray input, int win, int minp):
+    '''
+    O(N log(window)) implementation using skip list
+    '''
     return _roll_skiplist_op(input, win, minp, _get_min)
 
 cdef double_t _get_median(IndexableSkiplist skiplist, int nobs,
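
The new ewma docstring describes a center-of-mass parameterization. As a reference for what that means numerically, here is a plain-NumPy sketch of an adjusted EWMA, assuming the mapping alpha = 1 / (1 + com) used by later pandas releases (an illustration, not the Cython implementation above):

import numpy as np

def ewma_reference(x, com):
    # Decay factor implied by the center of mass: alpha = 1 / (1 + com)
    alpha = 1.0 / (1.0 + com)
    out = np.empty(len(x))
    num = 0.0   # running weighted sum of observations
    den = 0.0   # running sum of weights
    for t, v in enumerate(x):
        # At step t, observation x[t - i] carries weight (1 - alpha) ** i
        num = (1.0 - alpha) * num + v
        den = (1.0 - alpha) * den + 1.0
        out[t] = num / den
    return out

With com = 0 all weight falls on the current observation; larger com lengthens the effective memory of the average.

Similarly, the O(N log(window)) docstrings on roll_median/roll_max/roll_min refer to the IndexableSkiplist behind _roll_skiplist_op. A pure-Python sorted-window rolling median sketch (hypothetical helper, minp handling omitted) shows the idea; bisect keeps the window ordered but pays O(window) per insert and delete, which is exactly the step the skip list reduces to O(log window):

import bisect

def roll_median_reference(values, win):
    window = []   # sorted copy of the trailing window
    out = []
    for i, v in enumerate(values):
        bisect.insort(window, v)
        if len(window) > win:
            window.remove(values[i - win])   # drop the expiring observation
        # One median convention: upper-middle element of the sorted window;
        # partial windows at the start are allowed here for brevity
        out.append(window[len(window) // 2])
    return out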
