pd.DataFrame()函数
import pandas as pd
import numpy as np
# Column-oriented input: each key becomes a column, each list its values.
# The trailing np.nan in 'year' is a missing value, so that column is
# displayed as floats (2000.0, ...) rather than integers.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, np.nan],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
pd.DataFrame(data=data)
    state    year  pop
0    Ohio  2000.0  1.5
1    Ohio  2001.0  1.7
2    Ohio  2002.0  3.6
3  Nevada  2001.0  2.4
4  Nevada     NaN  2.9
# Nested mapping: outer keys 'a'/'b'/'c', each holding 'tp' and 'fp' counts.
d = {
    'a': {'tp': 26, 'fp': 112},
    'b': {'tp': 26, 'fp': 91},
    'c': {'tp': 23, 'fp': 74},
}

# orient='index': the outer keys become the row labels.
df_index = pd.DataFrame.from_dict(data=d, orient='index')
df_index

# orient='columns': the outer keys become the column labels.
df_columns = pd.DataFrame.from_dict(data=d, orient='columns')
df_columns
# 4x4 frame holding the integers 10..25 in row-major order, with explicit
# row and column labels.
data = pd.DataFrame(
    data=np.arange(10, 26).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data
          one  two  three  four
Ohio       10   11     12    13
Colorado   14   15     16    17
Utah       18   19     20    21
New York   22   23     24    25
# Seed NumPy's global RNG so the random values below are reproducible.
np.random.seed(10)

# Six consecutive daily timestamps starting 2019-01-01, used as row labels.
dates = pd.date_range('20190101', periods=6)

# 6x4 frame of standard-normal draws with columns 'A'..'D'.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df
                   A         B         C         D
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
DataFrame基本属性
# Row labels of the frame (a DatetimeIndex for this df).
df.index
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06'],
dtype='datetime64[ns]', freq='D')
# Give the row index a name; it is printed on its own line above the rows.
df.index.name = 'time'
df
                   A         B         C         D
time
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
# Column labels of the frame.
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
# Give the column index a name; it is printed in the header's top-left corner.
df.columns.name = 'alphabet'
df
alphabet           A         B         C         D
time
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
# The underlying data as a 2-D NumPy array (row/column labels are dropped).
df.values
array([[ 1.3315865 , 0.71527897, -1.54540029, -0.00838385],
[ 0.62133597, -0.72008556, 0.26551159, 0.10854853],
[ 0.00429143, -0.17460021, 0.43302619, 1.20303737],
[-0.96506567, 1.02827408, 0.22863013, 0.44513761],
[-1.13660221, 0.13513688, 1.484537 , -1.07980489],
[-1.97772828, -1.7433723 , 0.26607016, 2.38496733]])
# First three rows.
df.head(n=3)
alphabet           A         B         C         D
time
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
# Last three rows.
df.tail(n=3)
alphabet           A         B         C         D
time
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
# Small frame for the set_index / reset_index examples: 'a' counts up from 0,
# 'b' counts down from 7, and 'c'/'d' together form a two-level key.
df = pd.DataFrame(
    data={
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one'] * 3 + ['two'] * 4,
        'd': [0, 1, 2, 0, 1, 2, 3],
    }
)
df
   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
# Move columns 'c' and 'd' into a two-level row index; by default they are
# removed from the regular columns.
df2 = df.set_index(keys=['c', 'd'])
df2
       a  b
c   d
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
# drop=False keeps 'c' and 'd' as ordinary columns in addition to the index.
df.set_index(keys=['c', 'd'], drop=False)
       a  b    c  d
c   d
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
# Turn the index levels back into regular columns and restore a 0..n-1 index.
df2.reset_index()
     c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1
DataFrame计算、描述性统计
# Rebuild the seeded 6x4 random frame (same seed, hence the same values as
# the earlier example) for the computation/statistics demos.
np.random.seed(10)
dates = pd.date_range('20190101', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)

# Round every column to two decimal places; returns a new frame.
df.round(decimals=2)
               A     B     C     D
2019-01-01  1.33  0.72 -1.55 -0.01
2019-01-02  0.62 -0.72  0.27  0.11
2019-01-03  0.00 -0.17  0.43  1.20
2019-01-04 -0.97  1.03  0.23  0.45
2019-01-05 -1.14  0.14  1.48 -1.08
2019-01-06 -1.98 -1.74  0.27  2.38
# Per-column precision: 'A' to 1 decimal, 'C' to 2; other columns unchanged.
df.round(decimals={'A': 1, 'C': 2})
              A         B     C         D
2019-01-01  1.3  0.715279 -1.55 -0.008384
2019-01-02  0.6 -0.720086  0.27  0.108549
2019-01-03  0.0 -0.174600  0.43  1.203037
2019-01-04 -1.0  1.028274  0.23  0.445138
2019-01-05 -1.1  0.135137  1.48 -1.079805
2019-01-06 -2.0 -1.743372  0.27  2.384967
# Per-column summary statistics: count, mean, std, min, quartiles, max.
df.describe()
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.353697 -0.126561  0.188729  0.508917
std    1.228268  1.007917  0.975651  1.179607
min   -1.977728 -1.743372 -1.545400 -1.079805
25%   -1.093718 -0.583714  0.237850  0.020849
50%   -0.480387 -0.019732  0.265791  0.276843
75%    0.467075  0.570243  0.391287  1.013562
max    1.331587  1.028274  1.484537  2.384967
# Show the unrounded frame again as the baseline for the apply() examples.
df
                   A         B         C         D
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
# Apply np.cumsum to each column (axis=0): a running total down the rows.
df.apply(func=np.cumsum, axis=0, result_type=None)
                   A         B         C         D
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  1.952922 -0.004807 -1.279889  0.100165
2019-01-03  1.957214 -0.179407 -0.846863  1.303202
2019-01-04  0.992148  0.848867 -0.618232  1.748340
2019-01-05 -0.144454  0.984004  0.866305  0.668535
2019-01-06 -2.122182 -0.759368  1.132375  3.053502
# Range (max minus min) of each column; yields one value per column label.
df.apply(lambda col: col.max() - col.min())
A 3.309315
B 2.771646
C 3.029937
D 3.464772
dtype: float64
重新索引、选择、标签操作
# Rename column 'A' to 'key2'. With inplace=False, rename() returns a new
# DataFrame and leaves df itself untouched, so the result is bound back to
# df: the sorting example that follows displays the 'key2' label and
# therefore needs the renamed frame.
df = df.rename(columns={'A': 'key2'}, inplace=False)
df
                key2         B         C         D
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
排序
# Sort rows by their index labels (axis=0), newest date first.
df.sort_index(ascending=False, axis=0)
                key2         B         C         D
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-01  1.331587  0.715279 -1.545400 -0.008384