pandas tutorial

最后发布时间:2024-12-24 10:13:45 浏览量:

create和attribute(创建与属性)

# Creating pandas objects and inspecting their attributes

# A Series built from a plain list; np.nan marks a missing entry.
s = pd.Series([1, 2, 3, 3, np.nan, 4, 4, 1])
print(s)

# Six consecutive daily timestamps starting at 2020-01-01.
dates = pd.date_range('20200101', periods=6)
print("dates:\n", dates)

# A 6x4 frame of standard-normal draws, labelled by date (rows) and letter (columns).
random_block = np.random.randn(6, 4)
df = pd.DataFrame(random_block, index=dates, columns=list('abcd'))
print(df)

# Without explicit labels, rows and columns get default integer labels.
print(pd.DataFrame(np.random.randn(12).reshape((3, 4))))

# A frame built from a dict: scalars are broadcast to the length of the
# array-like entries, and every column keeps its own dtype.
column_spec = {
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
}
df2 = pd.DataFrame(column_spec)

# Print the frame followed by its most commonly inspected views.
for view in (
    df2,
    df2.dtypes,      # dtype of each column
    df2.index,       # row labels
    df2.columns,     # column labels
    df2.values,      # underlying data as a NumPy array
    df2.describe(),  # summary statistics
    df2.T,           # transpose
):
    print(view)

添加新的一行

import pandas as pd

# Recipe 1: grow a frame one row at a time by assigning to the next
# integer label (len(df.index) is always one past the last label here).
df = pd.DataFrame(columns=['A'])
for i in range(5):
    df.loc[len(df.index)] = [i]
df

# The remaining recipes each add one row to the same two-column frame.
df = pd.DataFrame(columns=['clade', 'value'])

# Recipe 2: DataFrame.append was removed in pandas 2.0. Wrap the new row
# in a one-row frame and concatenate; concat returns a NEW frame, so the
# result must be assigned back.
new_row = dict(zip(df.columns, ['a', 'b']))
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

# Recipe 3: same idea using from_records. The record keys must match the
# existing column names, otherwise concat adds extra columns instead.
df = pd.concat(
    [df, pd.DataFrame.from_records([{'clade': 1, 'value': 2}])],
    ignore_index=True,
)

# Recipe 4: label-based enlargement, naming the target columns explicitly
# or via df.columns. One value is required per selected column.
df.loc[len(df), ['clade', 'value']] = 1, 2
df.loc[len(df), df.columns] = 3, 4

排序

# Sorting examples; df2 must already exist (it is not defined in this snippet).
print(df2.sort_index(axis=1,ascending=False)) # axis=1: order the columns by their labels, descending
print(df2.sort_index(axis=0,ascending=False)) # axis=0: order the rows by their index labels, descending
print(df2.sort_values(by='E')) # order the rows by the values in column 'E'

## 选择数据

# Build a 6x4 frame holding 0..23, indexed by six consecutive dates,
# then show the different ways of selecting from it.
dates = pd.date_range('20200101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)
# A single column, by key and by attribute access (both give the same Series).
print(df['A'])
print(df.A)
# Slicing with [] selects ROWS: first by position (end exclusive) ...
print(df[0:3])
# ... then by label (both end points inclusive).
print(df['2020-01-02':'2020-01-04'])

# select by label: loc
# One row label -> that row as a Series.
print(df.loc['20200102'])
# All rows, only columns A and B.
print(df.loc[:,['A','B']])
# Rows from 2020-01-03 onwards, only columns A and B.
print(df.loc['20200103':,['A','B']])


# select by position: iloc
# Scalar at row position 3, column position 1.
print(df.iloc[3,1])
# Positional slices are half-open: rows 3-4, columns 1-2.
print(df.iloc[3:5,1:3])
# An explicit list of row positions combined with a column slice.
print(df.iloc[[1,3,5],1:3])

# Boolean indexing
# Keep only the rows whose value in column A is greater than 8.
print(df[df.A>8])

设置值

# Setting values in a DataFrame
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(
    np.arange(24, dtype='float64').reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)

# Set a single cell, first by position and then by label.
df.iloc[2, 2] = 111
df.loc['20130101', 'B'] = 222

# Zero columns A and B on every row where A > 4.
# - The mask is computed once, BEFORE A is modified; recomputing it after
#   A has been zeroed would match no rows and leave B untouched.
# - A single .loc assignment is used instead of chained indexing
#   (df.A[mask] = 0), which may write to a temporary copy and does nothing
#   under pandas Copy-on-Write.
# (df[df.A > 4] = 0 would instead zero EVERY column of the matching rows.)
a_gt_4 = df['A'] > 4
df.loc[a_gt_4, ['A', 'B']] = 0

# Add new columns: a scalar is broadcast to every row, a Series is aligned
# to the frame by index label.
df['F'] = np.nan
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)
print(df)

处理丢失数据

# Handling missing data
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(
    np.arange(24, dtype='float64').reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
# Knock out two cells so there is something to clean up.
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan

print(df)
# Drop rows (axis=0) containing a NaN; how='any' drops a row with at least
# one NaN, how='all' only drops rows that are entirely NaN.
print(df.dropna(axis=0, how='any'))
# Replace every NaN with 0.
print(df.fillna(value=0))
# Boolean mask marking the missing cells.
print(df.isnull())
# Is anything missing at all? isnull must be CALLED: passing the bound
# method itself to np.any is always truthy, whatever the data contains.
has_missing = bool(df.isnull().values.any())
print(has_missing)

pandas 导入导出

# Write df as a tab-separated text file, omitting the index column.
# NOTE(review): file_name is not defined in this snippet; it must be bound
# to the output path before this line runs.
df.to_csv(file_name, sep='\t',index=False)
# Load a CSV file into a DataFrame (the path is relative to the working directory).
data = pd.read_csv("data/student.csv")
# Save the frame in pickle format. Pickle files should only ever be read
# back from trusted sources, because unpickling can execute arbitrary code.
data.to_pickle('data/student.pickle')

合并concat

# concat: stack three frames that share the same columns on top of each
# other (axis=0). ignore_index=True renumbers the rows 0..n-1 instead of
# repeating each frame's own labels.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
res

# join={'outer', 'inner'}: when the columns differ, 'outer' (the default)
# keeps the union of the columns and fills the gaps with NaN, while
# 'inner' keeps only the columns present in every frame (here b, c, d).
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['b', 'c', 'd', 'e'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'], index=[2, 3, 4])
res = pd.concat([df1, df2], join='inner', ignore_index=True)
res

# append: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat([df1, df2], ignore_index=True) is the direct replacement.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['b', 'c', 'd', 'e'], index=[1, 2, 3])
res = pd.concat([df1, df2], ignore_index=True)
res
# reduce lives in functools on Python 3 and must be imported explicitly.
from functools import reduce

# Three frames sharing the key columns 'name' and 'age'. Keys are compared
# exactly, so 'YX' and 'yx' (or ages '18' and '20') are different keys.
left = pd.DataFrame({'name': ['YX', 'XY', 'XX', 'YY'],
                     'age': ['18', '18', '22', '19'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'name': ['YX', 'XY', 'XX', 'yx'],
                      'age': ['18', '20', '22', '19'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
middle = pd.DataFrame({'name': ['YX', 'XY', 'xx', 'yx'],
                       'age': ['18', '20', '22', '19'],
                       'E': ['E0', 'E1', 'E2', 'E3'],
                       'F': ['F0', 'F1', 'F2', 'F3']})

# Outer merge of two frames: keep every (name, age) pair that occurs in
# either frame; columns missing on one side are filled with NaN.
pair_merged = pd.merge(left, right, how='outer', on=['name', 'age'])
pair_merged

# Merging more than two frames: fold pd.merge over the list, so each step
# merges the accumulated result with the next frame.
l1 = [left, middle, right]

all_merged = reduce(
    lambda x, y: pd.merge(x, y, how='outer', on=['name', 'age']),
    l1,
)
all_merged

遍历

import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})

# itertuples yields one namedtuple per row: the index is available as
# row.Index and every column as an attribute. It is generally the faster
# of the two row iterators and keeps each value's own type.
for row in df.itertuples():
    print(row.Index, row.A, row.B)
# Output:
# 0 1 a
# 1 2 b
# 2 3 c

# iterrows yields (index, Series) pairs. Every row is materialised as a
# Series, so it is slower and a row with mixed column types is upcast to a
# common dtype.
for index, row in df.iterrows():
    print(index, row['A'], row['B'])
# Output:
# 0 1 a
# 1 2 b
# 2 3 c

分组统计

# Group-and-count example: count how many coefficients of a saved model are
# zero versus non-zero.
# Presumably the repr of gmwi2.coef_ (a single-row 2-D array) -- TODO confirm:
# array([[0., 0., 0., ..., 0., 0., 0.]])
from joblib import load
# Load the saved model object from disk. joblib files are pickle-based, so
# only load files that come from a trusted source.
gmwi2 = load("/data/workspace/1/pipeline/DetectionOfIntestinalBacteria/bin/GMWI2_model.joblib")
# Turn the boolean mask (coef_ != 0) into a frame with one row labelled 'coef',
# transpose it so 'coef' becomes a column, then count rows per value.
pd.DataFrame(gmwi2.coef_!=0,index=['coef']).T.groupby("coef").size()
# Build a frame from a nested dict: with orient='index' the outer keys become
# the row labels and the inner keys become the columns.
pd.DataFrame.from_dict({"row1":{"A":1,"B":4},"row2":{"A":2,"B":5},"row3":{"A":3,"B":6}}, orient='index')
A	B
row1	1	4
row2	2	5
row3	3