pd.DataFrame()函数

pd.DataFrame()函数

import pandas as pd
import numpy as np

# Column-oriented input: each key names a column and each list holds that
# column's values. np.nan marks a missing value (NA).
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, np.nan],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
# No index is passed, so a default integer index is generated. An explicit one
# could be supplied instead, e.g. index=['a', 'b', 'c', 'd', 'e'] or
# index=range(5).
pd.DataFrame(data=data)

    state    year  pop
0    Ohio  2000.0  1.5
1    Ohio  2001.0  1.7
2    Ohio  2002.0  3.6
3  Nevada  2001.0  2.4
4  Nevada     NaN  2.9
# Two-level nested dict: outer keys 'a'/'b'/'c', inner keys 'tp'/'fp'.
d = {
    'a': {'tp': 26, 'fp': 112},
    'b': {'tp': 26, 'fp': 91},
    'c': {'tp': 23, 'fp': 74},
}
# orient='index': the outer keys become the row labels and the inner keys
# become the columns.
df_index = pd.DataFrame.from_dict(data=d, orient='index')
df_index

   tp   fp
a  26  112
b  26   91
c  23   74
# orient='columns': the outer keys of d ('a', 'b', 'c') become the column
# labels and the inner keys ('tp', 'fp') become the row index.
df_columns = pd.DataFrame.from_dict(d,orient='columns')
df_columns

      a   b   c
tp   26  26  23
fp  112  91  74
# Build a DataFrame from a 4x4 NumPy array (the integers 10..25), supplying
# explicit row labels (index) and column labels.
data = pd.DataFrame(
    data=np.arange(10, 26).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

          one  two  three  four
Ohio       10   11     12    13
Colorado   14   15     16    17
Utah       18   19     20    21
New York   22   23     24    25
# Build a DataFrame of random values with a daily DatetimeIndex (six days
# starting 2019-01-01) and columns A-D. Seeding makes the values reproducible.
np.random.seed(10)
dates = pd.date_range(start='20190101', periods=6)
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df
                   A         B         C         D
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967

DataFrame基本属性

#  DataFrame.index: The index (row labels) of the DataFrame.

df.index
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')
# Set the name of the row index.
df.index.name = 'time'
df
                   A         B         C         D
time
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')
# Set the name of the column index; it appears in the header row when the
# frame is displayed.
df.columns.name = 'alphabet'

df
alphabet           A         B         C         D
time
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
# Inspect the underlying NumPy data.
df.values

array([[ 1.3315865 ,  0.71527897, -1.54540029, -0.00838385],
       [ 0.62133597, -0.72008556,  0.26551159,  0.10854853],
       [ 0.00429143, -0.17460021,  0.43302619,  1.20303737],
       [-0.96506567,  1.02827408,  0.22863013,  0.44513761],
       [-1.13660221,  0.13513688,  1.484537  , -1.07980489],
       [-1.97772828, -1.7433723 ,  0.26607016,  2.38496733]])
# DataFrame indexing
df.head(3)  # show the first three rows
alphabet           A         B         C         D
time
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
df.tail(3)   # show the last three rows
alphabet           A         B         C         D
time
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
# Small example frame: 'a' counts up from 0, 'b' counts down from 7, and
# 'c'/'d' together form a two-level key (used below as a hierarchical index).
df = pd.DataFrame(
    data={
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
        'd': [0, 1, 2, 0, 1, 2, 3],
    }
)
df


   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
# set_index turns one or more DataFrame columns into the row index.
df2 = df.set_index(['c', 'd'])
df2
       a  b
c   d
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
# drop defaults to True; with drop=False the columns used for the index are
# also kept as regular columns.
df.set_index(['c', 'd'], drop=False)
       a  b    c  d
c   d
one 0  0  7  one  0
    1  1  6  one  1
    2  2  5  one  2
two 0  3  4  two  0
    1  4  3  two  1
    2  5  2  two  2
    3  6  1  two  3
# reset_index is the inverse of set_index: the levels of the hierarchical
# index are moved back into columns.
df2.reset_index()
     c  d  a  b
0  one  0  0  7
1  one  1  1  6
2  one  2  2  5
3  two  0  3  4
4  two  1  4  3
5  two  2  5  2
6  two  3  6  1

DataFrame计算、描述性统计

# Recreate the seeded random frame, then display it with every value rounded
# to two decimal places (round returns a new frame; df itself is unchanged).
np.random.seed(10)
dates = pd.date_range(start='20190101', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df.round(decimals=2)
               A     B     C     D
2019-01-01  1.33  0.72 -1.55 -0.01
2019-01-02  0.62 -0.72  0.27  0.11
2019-01-03  0.00 -0.17  0.43  1.20
2019-01-04 -0.97  1.03  0.23  0.45
2019-01-05 -1.14  0.14  1.48 -1.08
2019-01-06 -1.98 -1.74  0.27  2.38
# Round different columns to different numbers of decimal places; columns not
# listed in the dict (here B and D) are left unrounded.
df.round({'A': 1, 'C': 2})
              A         B     C         D
2019-01-01  1.3  0.715279 -1.55 -0.008384
2019-01-02  0.6 -0.720086  0.27  0.108549
2019-01-03  0.0 -0.174600  0.43  1.203037
2019-01-04 -1.0  1.028274  0.23  0.445138
2019-01-05 -1.1  0.135137  1.48 -1.079805
2019-01-06 -2.0 -1.743372  0.27  2.384967
# Quick summary statistics for the numeric data.
df.describe()
              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean  -0.353697 -0.126561  0.188729  0.508917
std    1.228268  1.007917  0.975651  1.179607
min   -1.977728 -1.743372 -1.545400 -1.079805
25%   -1.093718 -0.583714  0.237850  0.020849
50%   -0.480387 -0.019732  0.265791  0.276843
75%    0.467075  0.570243  0.391287  1.013562
max    1.331587  1.028274  1.484537  2.384967
df

                   A         B         C         D
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
df.apply(np.cumsum, axis=0, result_type=None )
# The first row is unchanged.
# Second row = first row + second row,
# and so on: each row holds the running total down its column.
                   A         B         C         D
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  1.952922 -0.004807 -1.279889  0.100165
2019-01-03  1.957214 -0.179407 -0.846863  1.303202
2019-01-04  0.992148  0.848867 -0.618232  1.748340
2019-01-05 -0.144454  0.984004  0.866305  0.668535
2019-01-06 -2.122182 -0.759368  1.132375  3.053502
df.apply(lambda x: x.max() - x.min())  # range (max - min) of each column

A    3.309315
B    2.771646
C    3.029937
D    3.464772
dtype: float64

重新索引、选择、标签操作

# Rename column 'A' to 'key2'. rename() returns a new DataFrame unless
# inplace=True, so the result is assigned back to df; otherwise the later
# cells, whose output shows a 'key2' column, would still see the label 'A'.
df = df.rename(columns={'A': 'key2'})
df
                key2         B         C         D
2019-01-01  1.331587  0.715279 -1.545400 -0.008384
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-06 -1.977728 -1.743372  0.266070  2.384967

排序

# axis=0 (the default) sorts the rows by their index labels; ascending
# defaults to True (ascending order), so ascending=False sorts descending.
df.sort_index(axis=0, ascending=False)
# df.sort_index(axis=0, ascending=True)

                key2         B         C         D
2019-01-06 -1.977728 -1.743372  0.266070  2.384967
2019-01-05 -1.136602  0.135137  1.484537 -1.079805
2019-01-04 -0.965066  1.028274  0.228630  0.445138
2019-01-03  0.004291 -0.174600  0.433026  1.203037
2019-01-02  0.621336 -0.720086  0.265512  0.108549
2019-01-01  1.331587  0.715279 -1.545400 -0.008384

版权声明:本文为happy_wealthy原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。