pandas入门
源码点这里
from pandas
import Series,DataFrame
import pandas
as pd
pandas的两个主要数据结构:Series和DataFrame
Series 是一种类似于一维数组的对象,它由一组数据以及一组与之相关的数据标签(即索引)组成。
obj=Series([
4,
7,-
5,
3])
obj 0 4 1 7 2 -5 3 3 dtype: int64
obj.values
obj.index RangeIndex(start=0, stop=4, step=1)
obj2=Series([
4,
7,-
5,
3],index=[
'd',
'b',
'a',
'c'])
obj2 d 4 b 7 a -5 c 3 dtype: int64
obj2[
'a'] -5
obj2[
'd'] 4
obj2[[
'c',
'a',
'd']] c 3 a -5 d 4 dtype: int64
obj2[obj2>
0] d 4 b 7 c 3 dtype: int64
obj2*
2 d 8 b 14 a -10 c 6 dtype: int64
import numpy
as np
np.exp(obj2) d 54.598150 b 1096.633158 a 0.006738 c 20.085537 dtype: float64
sdata={
'0hio':
35000,
'Texta':
79300,
'Ohjsodf':
16000,
'Jsdf':
5000}
obj3=Series(sdata)
obj3 0hio 35000 Jsdf 5000 Ohjsodf 16000 Texta 79300 dtype: int64
states=[
'California',
'0hio',
'Texta',
'Ohjsodf']
obj4=Series(sdata,index=states)
obj4 California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 dtype: float64
pd.isnull(obj4) California True 0hio False Texta False Ohjsodf False dtype: bool
pd.notnull(obj4) California False 0hio True Texta True Ohjsodf True dtype: bool
obj4.isnull() California True 0hio False Texta False Ohjsodf False dtype: bool
**下面重点关注如何处理缺失数据**
obj3 0hio 35000 Jsdf 5000 Ohjsodf 16000 Texta 79300 dtype: int64
obj4 California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 dtype: float64
obj3+obj4 0hio 70000.0 California NaN Jsdf NaN Ohjsodf 32000.0 Texta 158600.0 dtype: float64
obj4.name=
'population'
obj4.index.name=
'state'
obj4 state California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 Name: population, dtype: float64
obj 0 4 1 7 2 -5 3 3 dtype: int64
obj.index=[
'Bob',
'Steve',
'Jeff',
'Ryan']
obj Bob 4 Steve 7 Jeff -5 Ryan 3 dtype: int64
DataFrame
#### DataFrame是一个表格型的数据结构,它含有一组有序的列,每列可以是不同的值类型(数值、字符串、布尔值等)
#### DataFrame既有行索引也有列索引,它可以被看做由Series组成的字典(共用同一个索引)
data={
'state':[
'0hi0',
'0hio',
'0hio',
'Nevada',
'Nevada'],
'year':[
2000,
2001,
2002,
2001,
2002],
'pop':[
1.5,
1.7,
3.6,
2.4,
2.9]
}
frame=DataFrame(data)
frame
popstateyear
01.50hi0200011.70hio200123.60hio200232.4Nevada200142.9Nevada2002
DataFrame(data,columns=[
'year',
'state',
'pop'])
yearstatepop
020000hi01.5120010hio1.7220020hio3.632001Nevada2.442002Nevada2.9
frame2=DataFrame(data,columns=[
'year',
'state',
'pop',
'debt'],index=[
'one',
'two',
'three',
'four',
'five'])
frame2
yearstatepopdebt
one20000hi01.5NaNtwo20010hio1.7NaNthree20020hio3.6NaNfour2001Nevada2.4NaNfive2002Nevada2.9NaN
frame2.columns Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’)
frame2[
'state'] one 0hi0 two 0hio three 0hio four Nevada five Nevada Name: state, dtype: object
frame2.year one 2000 two 2001 three 2002 four 2001 five 2002 Name: year, dtype: int64
frame2.ix[
'three'] year 2002 state 0hio pop 3.6 debt NaN Name: three, dtype: object
frame2[
'debt']=
16.
frame2
frame2[
'debt']=np.arange(
5.)
frame2
yearstatepopdebt
one20000hi01.50.0two20010hio1.71.0three20020hio3.62.0four2001Nevada2.43.0five2002Nevada2.94.0
val=Series([-
1.2,-
1.5,-
1.7],index=[
'two',
'four',
'five'])
frame2[
'debt']=val
frame2
yearstatepopdebt
one20000hi01.5NaNtwo20010hio1.7-1.2three20020hio3.6NaNfour2001Nevada2.4-1.5five2002Nevada2.9-1.7
frame2[
'eastern']=frame2.state==
'0hi0'
frame2
yearstatepopdebteastern
one20000hi01.5NaNTruetwo20010hio1.7-1.2Falsethree20020hio3.6NaNFalsefour2001Nevada2.4-1.5Falsefive2002Nevada2.9-1.7False
del frame2[
'eastern']
frame2.columns Index([‘year’, ‘state’, ‘pop’, ‘debt’], dtype=’object’)
pop={
'Nevada':{
2001:
2.4,
2002:
2.9},
'0hio':{
2000:
1.5,
2001:
1.7,
2002:
3.6}}
frame3=DataFrame(pop)
frame3
0hioNevada
20001.5NaN20011.72.420023.62.9
frame3.T
200020012002
0hio1.51.73.6NevadaNaN2.42.9
DataFrame(pop,index=[
2001,
2002,
2003])
0hioNevada
20011.72.420023.62.92003NaNNaN
frame3.index.name=
'year'
frame3.columns.name=
'state'
frame3
state0hioNevadayear
20001.5NaN20011.72.420023.62.9
frame3.values array([[ 1.5, nan], [ 1.7, 2.4], [ 3.6, 2.9]])
frame2.values array([[2000, ‘0hi0’, 1.5, nan], [2001, ‘0hio’, 1.7, -1.2], [2002, ‘0hio’, 3.6, nan], [2001, ‘Nevada’, 2.4, -1.5], [2002, ‘Nevada’, 2.9, -1.7]], dtype=object)
索引对象
obj=Series(range(
3),index=[
'a',
'b',
'c'])
index=obj.index
index Index([‘a’, ‘b’, ‘c’], dtype=’object’)
index[
1:] Index([‘b’, ‘c’], dtype=’object’)
index[
1]=
'd' ————————————————————————— TypeError Traceback (most recent call last) in () 1 #Index对象是不能修改的(immutable),因此用户不能对其进行修改 —-> 2 index[1]=’d’ C:\Users\ZJL\AppData\Local\Programs\Python\Python35\lib\site-packages\pandas\indexes\base.py in __setitem__(self, key, value) 1402 1403 def __setitem__(self, key, value): -> 1404 raise TypeError(“Index does not support mutable operations”) 1405 1406 def __getitem__(self, key): TypeError: Index does not support mutable operations
index=pd.Index(np.arange(
3))
obj2=Series([
1.5,-
2.5,
0],index=index)
obj2.index
is index True
frame3
'0hio' in frame3.columns True
2002in frame3.index True
## 重新索引
obj=Series([
4.5,
7.2,-
5.3,
3.6],index=[
'd',
'b',
'a',
'c'])
obj d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64
obj2=obj.reindex([
'a',
'b',
'c',
'd',
'e'])
obj2 a -5.3 b 7.2 c 3.6 d 4.5 e NaN dtype: float64
obj.reindex([
'a',
'b',
'c',
'd',
'e'],fill_value=
0) a -5.3 b 7.2 c 3.6 d 4.5 e 0.0 dtype: float64
obj3=Series([
'blue',
'purple',
'yellow'],index=[
0,
2,
4])
obj3.reindex(range(
6),method=
'ffill') 0 blue 1 blue 2 purple 3 purple 4 yellow 5 yellow dtype: object
frame=DataFrame(np.arange(
9).reshape((
3,
3)),index=[
'a',
'b',
'c'],columns=[
'0hio',
'Texas',
'California'])
frame
0hioTexasCalifornia
a012b345c678
frame2=frame.reindex([
'a',
'b',
'c',
'd'])
frame2
0hioTexasCalifornia
a0.01.02.0b3.04.05.0c6.07.08.0dNaNNaNNaN
states=[
'Texas',
'Utah',
'California']
frame.reindex(columns=states)
TexasUtahCalifornia
a1NaN2b4NaN5c7NaN8
frame.reindex(index=[
'a',
'b',
'c',
'd'],method=
'ffill',columns=states)
TexasUtahCalifornia
a1NaN2b4NaN5c7NaN8d7NaN8
frame.ix[[
'a',
'b',
'c',
'd'],states]
TexasUtahCalifornia
a1.0NaN2.0b4.0NaN5.0c7.0NaN8.0dNaNNaNNaN
丢弃指定轴上的项
drop方法返回一个在指定轴上删除了指定值的新对象
obj=Series(np.arange(
5.0),index=[
'a',
'b',
'c',
'd',
'e'])
new_obj=obj.drop(
'c')
new_obj a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64
obj.drop([
'd',
'c']) a 0.0 b 1.0 e 4.0 dtype: float64
data=DataFrame(np.arange(
16).reshape((
4,
4)),index=[
'0hio',
'Colorado',
'Utah',
'New York'],columns=[
'one',
'two',
'three',
'four'])
data
onetwothreefour
0hio0123Colorado4567Utah891011New York12131415
data.drop([
'Colorado',
'0hio'])
data.drop(
'two',axis=
1)
onethreefour
0hio023Colorado467Utah81011New York121415
data.drop([
'two',
'four'],axis=
1)
onethree
0hio02Colorado46Utah810New York1214
索引、选取和过滤
obj=Series(np.arange(
4.),index=[
'a',
'b',
'c',
'd'])
obj[
'b'] 1.0
obj[
1] 1.0
obj[
2:
4] c 2.0 d 3.0 dtype: float64
obj[[
'b',
'a',
'd']] b 1.0 a 0.0 d 3.0 dtype: float64
obj[[
1,
3]] b 1.0 d 3.0 dtype: float64
obj[obj<
2] a 0.0 b 1.0 dtype: float64
obj[
'b':
'c'] b 1.0 c 2.0 dtype: float64
obj[
'b':
'c']=
5
obj a 0.0 b 5.0 c 5.0 d 3.0 dtype: float64
data=DataFrame(np.arange(
16).reshape((
4,
4)),index=[
'0hio',
'Colorado',
'Utah',
'New York'],columns=[
'one',
'two',
'three',
'four'])
data
onetwothreefour
0hio0123Colorado4567Utah891011New York12131415
data[
'two'] 0hio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int32
data[[
'two',
'three']]
twothree
0hio12Colorado56Utah910New York1314
data[:
2]
onetwothreefour
0hio0123Colorado4567
data[data[
'three']>
5]
onetwothreefour
Colorado4567Utah891011New York12131415
data<
5
onetwothreefour
0hioTrueTrueTrueTrueColoradoTrueFalseFalseFalseUtahFalseFalseFalseFalseNew YorkFalseFalseFalseFalse
data[data<
5]=
0
data
onetwothreefour
0hio0000Colorado0567Utah891011New York12131415
data.ix[
'Colorado',[
'two',
'three']] two 5 three 6 Name: Colorado, dtype: int32
data.ix[[
'Colorado',
'Utah'],[
3,
0,
1]]
fouronetwo
Colorado705Utah1189
data.ix[data.three>
5,:
3]
onetwothree
Colorado056Utah8910New York121314
算术运算和数据对齐
s1=Series([
7.3,-
2.5,
3.4,
1.5],index=[
'a',
'c',
'd',
'e'])
s2=Series([-
2.1,
3.6,-
1.5,
4,
3.1],index=[
'a',
'c',
'e',
'f',
'g'])
s1 a 7.3 c -2.5 d 3.4 e 1.5 dtype: float64
s2 a -2.1 c 3.6 e -1.5 f 4.0 g 3.1 dtype: float64
s1+s2 a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64
df1=DataFrame(np.arange(
9.).reshape((
3,
3)),columns=list(
'bcd'),index=[
'0hio',
'Texas',
'Colorado'])
df2=DataFrame(np.arange(
12.).reshape((
4,
3)),columns=list(
'bde'),index=[
'Utah',
'0hio',
'Texas',
'Oregon'])
df1
bcd
0hio0.01.02.0Texas3.04.05.0Colorado6.07.08.0
df2
bde
Utah0.01.02.00hio3.04.05.0Texas6.07.08.0Oregon9.010.011.0
df1+df2
bcde
0hio3.0NaN6.0NaNColoradoNaNNaNNaNNaNOregonNaNNaNNaNNaNTexas9.0NaN12.0NaNUtahNaNNaNNaNNaN
df1=DataFrame(np.arange(
12.).reshape((
3,
4)),columns=list(
'abcd'))
df2=DataFrame(np.arange(
20.).reshape((
4,
5)),columns=list(
'abcde'))
df1
abcd
00.01.02.03.014.05.06.07.028.09.010.011.0
df2
abcde
00.01.02.03.04.015.06.07.08.09.0210.011.012.013.014.0315.016.017.018.019.0
df1+df2
abcde
00.02.04.06.0NaN19.011.013.015.0NaN218.020.022.024.0NaN3NaNNaNNaNNaNNaN
df1.add(df2,fill_value=
0)
abcde
00.02.04.06.04.019.011.013.015.09.0218.020.022.024.014.0315.016.017.018.019.0
df1.reindex(columns=df2.columns,fill_value=
0)
abcde
00.01.02.03.0014.05.06.07.0028.09.010.011.00
DataFrame和Series之间的运算
arr=np.arange(
12.).reshape((
3,
4))
arr array([[ 0., 1., 2., 3.], [ 4., 5., 6., 7.], [ 8., 9., 10., 11.]])
arr[
0] array([ 0., 1., 2., 3.])
arr-arr[
0] array([[ 0., 0., 0., 0.], [ 4., 4., 4., 4.], [ 8., 8., 8., 8.]])
frame=DataFrame(np.arange(
12.).reshape((
4,
3)),columns=list(
'bde'),index=[
'Utah',
'0hio',
'Texas',
'Oregon'])
frame
bde
Utah0.01.02.00hio3.04.05.0Texas6.07.08.0Oregon9.010.011.0
series=frame.ix[
0]
series b 0.0 d 1.0 e 2.0 Name: Utah, dtype: float64
frame-series
bde
Utah0.00.00.00hio3.03.03.0Texas6.06.06.0Oregon9.09.09.0
series2=Series(range(
3),index=[
'b',
'e',
'f'])
frame+series2
bdef
Utah0.0NaN3.0NaN0hio3.0NaN6.0NaNTexas6.0NaN9.0NaNOregon9.0NaN12.0NaN
series3=frame[
'd']
frame
bde
Utah0.01.02.00hio3.04.05.0Texas6.07.08.0Oregon9.010.011.0
series3 Utah 1.0 0hio 4.0 Texas 7.0 Oregon 10.0 Name: d, dtype: float64
frame.sub(series3,axis=
0)
bde
Utah-1.00.01.00hio-1.00.01.0Texas-1.00.01.0Oregon-1.00.01.0
函数应用和映射
frame=DataFrame(np.random.randn(
4,
3),columns=list(
'bde'),index=[
'Utah',
'0hio',
'Texas',
'Oregon'])
frame
bde
Utah-0.191031-0.004688-0.3299700hio0.7082490.265398-2.346897Texas1.064349-1.811846-0.899921Oregon0.334061-1.0585060.655632
np.abs(frame)
bde
Utah0.1910310.0046880.3299700hio0.7082490.2653982.346897Texas1.0643491.8118460.899921Oregon0.3340611.0585060.655632
# Spread of a column/row: largest value minus smallest value.
# Passed to DataFrame.apply below, which calls it once per column
# (or once per row when axis=1), so ``x`` is a Series.
f = lambda x: x.max() - x.min()
frame.apply(f) b 1.255380 d 2.077245 e 3.002529 dtype: float64
frame.apply(f,axis=
1) Utah 0.325281 0hio 3.055145 Texas 2.876195 Oregon 1.714138 dtype: float64
def f(x):
    """Return the minimum and maximum of ``x`` as a labelled Series.

    Intended for use with ``DataFrame.apply``: because a Series (rather
    than a scalar) is returned, the result of ``apply`` is a DataFrame
    with the rows ``'min'`` and ``'max'``.

    Parameters
    ----------
    x : object exposing ``min()`` and ``max()``
        In this tutorial, one column (or row) of a DataFrame.

    Returns
    -------
    Series
        Two values indexed by ``'min'`` and ``'max'``.
    """
    return Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)
bde
min-0.191031-1.811846-2.346897max1.0643490.2653980.655632
# Element-wise formatter: render a number as a string with two decimals.
# NOTE: this name shadows the builtin ``format``; it is kept because the
# applymap/map calls that follow refer to it by this name.
format = lambda x: '%.2f' % x
frame.applymap(format)
bde
Utah-0.19-0.00-0.330hio0.710.27-2.35Texas1.06-1.81-0.90Oregon0.33-1.060.66
frame[
'e'].map(format) Utah -0.33 0hio -2.35 Texas -0.90 Oregon 0.66 Name: e, dtype: object
### 排序和排名
obj=Series(range(
4),index=[
'd',
'a',
'b',
'c'])
obj.sort_index() a 1 b 2 c 3 d 0 dtype: int32
obj=Series(range(
4),index=list(
'bacd'))
obj.sort_index() a 1 b 0 c 2 d 3 dtype: int32
frame=DataFrame(np.arange(
8).reshape((
2,
4)),index=[
'three',
'one'],columns=[
'd',
'a',
'b',
'c'])
frame.sort_index()
dabc
one4567three0123
frame.sort_index(axis=
1)
abcd
three1230one5674
frame.sort_index(axis=
1,ascending=
False)
dcba
three0321one4765
obj=Series([
4,
7,-
3,
2])
obj.sort_values() 2 -3 3 2 0 4 1 7 dtype: int64
obj=Series([
4,np.nan,
7,np.nan,-
3,
2])
obj.sort_values() 4 -3.0 5 2.0 0 4.0 2 7.0 1 NaN 3 NaN dtype: float64
frame=DataFrame({
'b':[
4,
7,-
3,
2],
'a':[
0,
1,
0,
1]})
frame
ab
00411720-3312
frame.sort_values(by=
'b')
ab
20-3312004117
frame.sort_values(by=[
'a',
'b'])
ab
20-3004312117
obj=Series([
7,-
5,
7,
4,
2,
0,
4])
obj.rank() 0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64
obj.rank(method=
'first') 0 6.0 1 1.0 2 7.0 3 4.0 4 3.0 5 2.0 6 5.0 dtype: float64
obj.rank(ascending=
False,method=
'max') 0 2.0 1 7.0 2 2.0 3 4.0 4 5.0 5 6.0 6 4.0 dtype: float64
frame=DataFrame({
'b':[
4.3,
7,-
3,
2],
'a':[
0,
1,
0,
1],
'c':[-
2,
5,
8,-
2.5]})
frame
abc
004.3-2.0117.05.020-3.08.0312.0-2.5
frame.rank(axis=
1)
abc
02.03.01.011.03.02.022.01.03.032.03.01.0
带有重复值的轴索引
obj=Series(range(
5),index=[
'a',
'a',
'b',
'b',
'c'])
obj a 0 a 1 b 2 b 3 c 4 dtype: int32
obj.index.is_unique False
obj[
'a'] a 0 a 1 dtype: int32
obj[
'c'] 4
df=DataFrame(np.random.randn(
4,
3),index=[
'a',
'a',
'b',
'b'])
df
012
a-0.524361-0.145395-1.322196a-0.666326-0.4966121.486401b-0.395841-0.9211940.260437b-0.187285-0.4560141.434571
df.ix[
'b']
012
b-0.395841-0.9211940.260437b-0.187285-0.4560141.434571
汇总和计算描述统计
df=DataFrame([[
1.4,np.nan],[
7.1,-
4.5],[np.nan,np.nan],[
0.75,-
1.3]],index=[
'a',
'b',
'c',
'd'],columns=[
'one',
'two'])
df
onetwo
a1.40NaNb7.10-4.5cNaNNaNd0.75-1.3
df.sum() one 9.25 two -5.80 dtype: float64
df.sum(axis=
1) a 1.40 b 2.60 c NaN d -0.55 dtype: float64
df.mean(axis=
1,skipna=
False) a NaN b 1.300 c NaN d -0.275 dtype: float64
df.idxmax() one b two d dtype: object
df.cumsum()
onetwo
a1.40NaNb8.50-4.5cNaNNaNd9.25-5.8
df.describe()
onetwo
count3.0000002.000000mean3.083333-2.900000std3.4936852.262742min0.750000-4.50000025%1.075000-3.70000050%1.400000-2.90000075%4.250000-2.100000max7.100000-1.300000
obj=Series([
'a',
'a',
'b',
'c']*
4)
obj.describe() count 16 unique 3 top a freq 8 dtype: object
## 相关系数与协方差
import pandas_datareader.data
as web
all_data={}
for ticker
in [
'AAPL',
'IBM',
'MSFT',
'GOOG']:
all_data[ticker]=web.get_data_yahoo(ticker,
'1/1/2000',
'1/1/2010')
price=DataFrame({tic:data[
'Adj Close']
for tic,data
in all_data.items()})
volume=DataFrame({tic:data[
'Volume']
for tic,data
in all_data.items()})
returns=price.pct_change()
returns.tail()
AAPLGOOGIBMMSFTDate
2009-12-240.0343390.0111170.0043850.0025872009-12-280.0122940.0070980.0133260.0054842009-12-29-0.011861-0.005571-0.0034770.0070582009-12-300.0121470.0053760.005461-0.0136992009-12-31-0.004300-0.004416-0.012597-0.015504
returns.MSFT.corr(returns.IBM) 0.49597963862836764
returns.corr()
AAPLGOOGIBMMSFT
AAPL1.0000000.4706760.4100110.424305GOOG0.4706761.0000000.3906890.443587IBM0.4100110.3906891.0000000.495980MSFT0.4243050.4435870.4959801.000000
returns.cov()
AAPLGOOGIBMMSFT
AAPL0.0010270.0003030.0002520.000309GOOG0.0003030.0005800.0001420.000205IBM0.0002520.0001420.0003670.000216MSFT0.0003090.0002050.0002160.000516
returns.corrwith(returns.IBM) AAPL 0.410011 GOOG 0.390689 IBM 1.000000 MSFT 0.495980 dtype: float64
returns.corrwith(volume) ————————————————————————— NameError Traceback (most recent call last) in () 1 #传入一个DataFrame则会计算按列名配对的相关系数。这里,计算百分比变化与成交量的相关系数 —-> 2 returns.corrwith(volume) NameError: name ‘returns’ is not defined
唯一值、值计数以及成员资格
obj=Series([
'c',
'a',
'd',
'a',
'a',
'b',
'b',
'c',
'c'])
uniques=obj.unique()
uniques array([‘c’, ‘a’, ‘d’, ‘b’], dtype=object)
obj.value_counts() c 3 a 3 b 2 d 1 dtype: int64
pd.value_counts(obj.values,sort=
False) d 1 b 2 a 3 c 3 dtype: int64
mask=obj.isin([
'b',
'c'])
mask 0 True 1 False 2 False 3 False 4 False 5 True 6 True 7 True 8 True dtype: bool
obj[mask] 0 c 5 b 6 b 7 c 8 c dtype: object
data=DataFrame({
'Qu1':[
1,
3,
4,
3,
4],
'Qu2':[
2,
3,
1,
2,
3],
'Qu3':[
1,
5,
2,
4,
4]})
data
Qu1Qu2Qu3
01211335241233244434
result=data.apply(pd.value_counts).fillna(
0)
result
Qu1Qu2Qu3
11.01.01.020.02.01.032.02.00.042.00.02.050.00.01.0
处理缺失数据
string_data=Series([
'aaedvark',
'artichoke',np.nan,
'avocado'])
string_data 0 aaedvark 1 artichoke 2 NaN 3 avocado dtype: object
string_data.isnull() 0 False 1 False 2 True 3 False dtype: bool
string_data[
0]=
None
string_data.isnull() 0 True 1 False 2 True 3 False dtype: bool
### 滤除缺失数据
from numpy
import nan
as NA
import numpy
as np
import pandas
as pd
from pandas
import Series,DataFrame
data=Series([
1,NA,
3.5,NA,
7])
data.dropna() 0 1.0 2 3.5 4 7.0 dtype: float64
data[data.notnull()] 0 1.0 2 3.5 4 7.0 dtype: float64
data=DataFrame([[
1.,
6.5,
3.],[
1.,NA,NA],[NA,NA,NA],[NA,
6.5,
3.]])
clearned=data.dropna()
data
012
01.06.53.011.0NaNNaN2NaNNaNNaN3NaN6.53.0
clearned
012
01.06.53.0
data.dropna(how=
'all')
012
01.06.53.011.0NaNNaN3NaN6.53.0
data[
4]=NA
data
0124
01.06.53.0NaN11.0NaNNaNNaN2NaNNaNNaNNaN3NaN6.53.0NaN
data.dropna(axis=
1,how=
'all')
012
01.06.53.011.0NaNNaN2NaNNaNNaN3NaN6.53.0
df=DataFrame(np.random.randn(
7,
3))
df.ix[:
4,
1]=NA
df.ix[:
2,
2]=NA
df
012
0-1.637463NaNNaN1-1.259674NaNNaN2-0.284635NaNNaN30.818905NaN-1.8782444-2.402401NaN-0.5339425-0.623351-1.472599-0.8606146-0.194565-1.757851-1.251312
df.dropna(thresh=
3)
012
5-0.623351-1.472599-0.8606146-0.194565-1.757851-1.251312
填充缺失数据
df.fillna(
0)
012
0-1.6374630.0000000.0000001-1.2596740.0000000.0000002-0.2846350.0000000.00000030.8189050.000000-1.8782444-2.4024010.000000-0.5339425-0.623351-1.472599-0.8606146-0.194565-1.757851-1.251312
df.fillna({
1:
0.5,
2:-
1})
012
0-1.6374630.500000-1.0000001-1.2596740.500000-1.0000002-0.2846350.500000-1.00000030.8189050.500000-1.8782444-2.4024010.500000-0.5339425-0.623351-1.472599-0.8606146-0.194565-1.757851-1.251312
_=df.fillna(
0,inplace=
True)
df
012
0-1.6374630.0000000.0000001-1.2596740.0000000.0000002-0.2846350.0000000.00000030.8189050.000000-1.8782444-2.4024010.000000-0.5339425-0.623351-1.472599-0.8606146-0.194565-1.757851-1.251312
df=DataFrame(np.random.randn(
6,
3))
df.ix[
2:,
1]=NA
df.ix[
4:,
2]=NA
df
012
00.1737990.2674220.48014111.303258-0.429756-0.7906612-0.110613NaN0.87806231.188953NaN-0.1255614-0.512800NaNNaN5-0.383978NaNNaN
df.fillna(method=
'ffill')
012
00.1737990.2674220.48014111.303258-0.429756-0.7906612-0.110613-0.4297560.87806231.188953-0.429756-0.1255614-0.512800-0.429756-0.1255615-0.383978-0.429756-0.125561
df.fillna(method=
'ffill',limit=
2)
012
00.1737990.2674220.48014111.303258-0.429756-0.7906612-0.110613-0.4297560.87806231.188953-0.429756-0.1255614-0.512800NaN-0.1255615-0.383978NaN-0.125561
data=Series([
1.,NA,
3.5,NA,
7])
data.fillna(data.mean()) 0 1.000000 1 3.833333 2 3.500000 3 3.833333 4 7.000000 dtype: float64
层次化索引
data=Series(np.random.randn(
10),index=[[
'a',
'a',
'a',
'b',
'b',
'b',
'c',
'c',
'd',
'd'],[
1,
2,
3,
1,
2,
3,
1,
2,
2,
3]])
data a 1 -2.059265 2 0.276982 3 -1.771092 b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 d 2 -1.697117 3 -0.659792 dtype: float64
data.index MultiIndex(levels=[[‘a’, ‘b’, ‘c’, ‘d’], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])
data[
'b'] 1 0.501535 2 1.547647 3 -0.038850 dtype: float64
data[
'b':
'c'] b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 dtype: float64
data.ix[[
'b',
'c']] b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 dtype: float64
data[:,
2] a 0.276982 b 1.547647 c -0.905470 d -1.697117 dtype: float64
data.unstack()
123
a-2.0592650.276982-1.771092b0.5015351.547647-0.038850c1.963156-0.905470NaNdNaN-1.697117-0.659792
data.unstack().stack() a 1 -2.059265 2 0.276982 3 -1.771092 b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 d 2 -1.697117 3 -0.659792 dtype: float64
frame=DataFrame(np.arange(
12).reshape((
4,
3)),index=[[
'a',
'a',
'b',
'b'],[
1,
2,
1,
2]],
columns=[[
'Ohio',
'Ohio',
'Colorado'],[
'Green',
'Red',
'Green']])
frame
OhioColoradoGreenRedGreen
a10122345b1678291011
frame.index.names=[
'key1',
'key2']
frame.columns.names=[
'state',
'color']
frame
stateOhioColoradocolorGreenRedGreenkey1key2
a10122345b1678291011
frame[
'Ohio']
colorGreenRedkey1key2
a101234b1672910
重排分级顺序
frame.swaplevel(
'key1',
'key2')
stateOhioColoradocolorGreenRedGreenkey2key1
1a0122a3451b6782b91011
frame.sortlevel(
1)
stateOhioColoradocolorGreenRedGreenkey1key2
a1012b1678a2345b291011
frame.swaplevel(
0,
1).sortlevel(
0)
stateOhioColoradocolorGreenRedGreenkey2key1
1a012b6782a345b91011
根据级别汇总统计
frame.sum(level=
'key2')
stateOhioColoradocolorGreenRedGreenkey2
168102121416
frame.sum(level=
'color',axis=
1)
colorGreenRedkey1key2
a121284b114722010
使用DataFrame的列
frame=DataFrame({
'a':range(
7),
'b':range(
7,
0,-
1),
'c':[
'one',
'one',
'one',
'two',
'two',
'two',
'two'],
'd':[
0,
1,
2,
0,
1,
2,
3]})
frame
abcd
007one0116one1225one2334two0443two1552two2661two3
frame2=frame.set_index([
'c',
'd'])
frame2
abcd
one007116225two034143252361
frame.set_index([
'c',
'd'],drop=
False)
abcdcd
one007one0116one1225one2two034two0143two1252two2361two3
frame2.reset_index()
cdab
0one0071one1162one2253two0344two1435two2526two361
其他有关pandas的话题
整数索引
ser=Series(np.arange(
3.))
ser
0 0.0
1 1.0
2 2.0
dtype: float64
ser[
1]
1.0
ser
ser2=Series(np.arange(
3.),index=[
'a',
'b',
'c'])
ser2[-
1]
2.0
ser.ix[:
1]
0 0.0
1 1.0
dtype: float64
ser3=Series(range(
3),index=[-
5,
1,
3])
ser3.iloc[
2]
2
frame=DataFrame(np.arange(
6).reshape(
3,
2),index=[
2,
0,
1])
frame.iloc[
0]
0 0
1 1
Name: 2, dtype: int32
面板数据
from pandas_datareader
import data
as web
pdata=pd.Panel(dict((stk,web.get_data_yahoo(stk,
'1/1/2009',
'6/1/2012') )
for stk
in [
'AAPL',
'GOOG',
'MSFT',
'DELL']))
pdata