# Creating pandas objects and inspecting their basic attributes.
# NOTE: relies on `pd` (pandas) and `np` (numpy) already being imported.

# A Series built from a list; np.nan marks a missing value.
s = pd.Series([1, 2, 3, 3, np.nan, 4, 4, 1])
print(s)

# Six consecutive daily timestamps, used as the row index below.
dates = pd.date_range('20200101', periods=6)
print("dates:\n", dates)

# A 6x4 frame of random normal values with explicit row and column labels.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
print(df)

# Without labels, pandas falls back to 0..n-1 for both rows and columns.
print(pd.DataFrame(np.random.randn(12).reshape((3, 4))))

# A frame built from a dict: scalars are broadcast to the length of the
# array-like entries (4 rows here), and every column keeps its own dtype.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
print(df2)
print(df2.dtypes)
print(df2.index)    # labels of all rows
print(df2.columns)  # labels of all columns
print(df2.values)
print(df2.describe())
print(df2.T)
import pandas as pd

# Grow an initially empty single-column frame one row at a time.
df = pd.DataFrame(columns=['A'])
for i in range(5):
    # len(df.index) is the next unused integer label, so .loc enlarges the
    # frame by one row instead of overwriting an existing one.
    df.loc[len(df.index)] = [i]
df  # bare expression: shows the frame when run as a notebook cell
# Several ways to append a single row to a frame with known columns.
df = pd.DataFrame(columns=['clade', 'value'])

# 1) Build a dict keyed by the existing columns and concatenate it as a
#    one-row frame. DataFrame.append was removed in pandas 2.0 (and never
#    modified the frame in place), so pd.concat with re-assignment is used.
new_row = dict(zip(df.columns, ['a', 'b']))
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

# 2) Concatenate a frame built from a list of records. The record keys must
#    match the existing column names, otherwise new columns are created.
df = pd.concat(
    [df, pd.DataFrame.from_records([{'clade': 1, 'value': 2}])],
    ignore_index=True,
)

# 3) Label-based enlargement: assigning to a row label that does not exist
#    yet adds that row. len(df) is the next free label because the index is
#    kept as 0..n-1 by ignore_index=True above.
df.loc[len(df), ['clade', 'value']] = 1, 2
df.loc[len(df), df.columns] = 3, 4
# Sorting. `df2` is the mixed-dtype frame with columns A..F built earlier.
print(df2.sort_index(axis=1, ascending=False))  # sort by column labels, descending
print(df2.sort_index(axis=0, ascending=False))  # sort by row labels, descending
print(df2.sort_values(by='E'))                  # sort rows by the values in column E
## 选择数据
# Selecting data from a frame indexed by six consecutive days.
dates = pd.date_range('20200101', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)

# Column access: bracket and attribute forms return the same Series.
print(df['A'])
print(df.A)

# Row slicing with []: by position, or by label range (end label included).
print(df[0:3])
print(df['2020-01-02':'2020-01-04'])

# Select by label: loc
print(df.loc['20200102'])
print(df.loc[:, ['A', 'B']])
print(df.loc['20200103':, ['A', 'B']])

# Select by position: iloc
print(df.iloc[3, 1])
print(df.iloc[3:5, 1:3])
print(df.iloc[[1, 3, 5], 1:3])

# Boolean indexing: keep the rows where the condition holds.
print(df[df.A > 8])
# Setting values in a frame.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(
    np.arange(24, dtype='float64').reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)

df.iloc[2, 2] = 111            # single cell, by position
df.loc['20130101', 'B'] = 222  # single cell, by label

# Alternative: `df[df.A > 4] = 0` would zero every column of the rows
# where A > 4.

# Zero columns A and B on the rows where A > 4.
# The mask is computed once, BEFORE A is modified; recomputing `df.A > 4`
# after zeroing A would select no rows and leave B untouched.
# A single .loc assignment is used instead of chained indexing
# (`df.A[mask] = 0`), which may write to a temporary copy and is a no-op
# under pandas copy-on-write.
a_gt_4 = df['A'] > 4
df.loc[a_gt_4, ['A', 'B']] = 0

# Adding columns: a scalar is broadcast; a Series is aligned on the index.
df['F'] = np.nan
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)
print(df)
# Handling missing data.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(
    np.arange(24, dtype='float64').reshape((6, 4)),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
# Knock out two cells so there is something to drop / fill.
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)

print(df.dropna(axis=0, how='any'))  # how={'any', 'all'}: drop rows with any / only NaN
print(df.fillna(value=0))            # replace NaN with 0
print(df.isnull())                   # boolean mask of missing cells

# Check whether any value is missing at all. isnull must be *called*:
# np.any(df.isnull) tests the bound method object itself, which is always
# truthy regardless of the data.
has_missing = df.isnull().values.any()
print(has_missing)
# Export the frame as tab-separated text; index=False leaves the row labels
# out of the file. `df` and `file_name` come from the surrounding session.
df.to_csv(
    file_name,
    index=False,
    sep='\t',
)
# Load a CSV file and re-save the parsed frame as a pickle.
data = pd.read_csv("data/student.csv")
data.to_pickle('data/student.pickle')
# Concatenating frames.

# Three frames with identical columns, stacked vertically.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
# ignore_index=True renumbers the rows 0..n-1 instead of repeating 0, 1, 2.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
res

# join={'outer', 'inner'}: with partially overlapping columns, 'inner' keeps
# only the columns present in every frame ('outer', the default, keeps all
# and fills the gaps with NaN).
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['b', 'c', 'd', 'e'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'], index=[2, 3, 4])
res = pd.concat([df1, df2], join='inner', ignore_index=True)
res

# Append: add the rows of one frame below another.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['b', 'c', 'd', 'e'], index=[1, 2, 3])
res = pd.concat([df1, df2], ignore_index=True)
res
# Merging two frames on a composite key.
left = pd.DataFrame({
    'name': ['YX', 'XY', 'XX', 'YY'],
    'age': ['18', '18', '22', '19'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'name': ['YX', 'XY', 'XX', 'yx'],
    'age': ['18', '20', '22', '19'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
# Outer join on (name, age): keeps every key pair from either side and fills
# the columns of the side that lacks the pair with NaN. Keys are compared
# exactly, so 'YX' and 'yx' are different names.
pd.merge(left, right, how='outer', on=['name', 'age'])
# Merging more than two frames on the same composite key.
# reduce is not a builtin in Python 3; import it here so the cell is
# self-contained.
from functools import reduce

left = pd.DataFrame({
    'name': ['YX', 'XY', 'XX', 'YY'],
    'age': ['18', '18', '22', '19'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'name': ['YX', 'XY', 'XX', 'yx'],
    'age': ['18', '20', '22', '19'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
middle = pd.DataFrame({
    'name': ['YX', 'XY', 'xx', 'yx'],
    'age': ['18', '20', '22', '19'],
    'E': ['E0', 'E1', 'E2', 'E3'],
    'F': ['F0', 'F1', 'F2', 'F3'],
})

# Two-frame form, for comparison:
# pd.merge(left, right, how='outer', on=['name', 'age'])

# pd.merge joins exactly two frames, so fold it over the list: the running
# result is merged with the next frame until the list is exhausted.
l1 = [left, middle, right]
reduce(lambda x, y: pd.merge(x, y, how='outer', on=['name', 'age']), l1)
import pandas as pd

# Row iteration with itertuples: each row is a namedtuple whose fields are
# `Index` (the row label) followed by one field per column.
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
for row in df.itertuples():
    print(row.Index, row.A, row.B)
0 1 a
1 2 b
2 3 c
import pandas as pd

# Row iteration with iterrows: yields (row label, row as a Series) pairs, so
# cells are read by column name.
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})
for index, row in df.iterrows():
    print(index, row['A'], row['B'])
# Count how many coefficients of a saved model are zero vs. non-zero.
# Presumably the repr of gmwi2.coef_ (a single-row array) -- confirm:
# array([[0., 0., 0., ..., 0., 0., 0.]])
from joblib import load

# NOTE(review): joblib.load unpickles the file and can execute arbitrary code
# while doing so -- only load model files from a trusted source.
gmwi2 = load("/data/workspace/1/pipeline/DetectionOfIntestinalBacteria/bin/GMWI2_model.joblib")

# `coef_ != 0` is a boolean mask; index=['coef'] labels its single row, .T
# turns that row into a column named 'coef', and grouping on it yields the
# number of False (zero) and True (non-zero) entries.
pd.DataFrame(gmwi2.coef_ != 0, index=['coef']).T.groupby("coef").size()
# Build a frame from a dict of dicts. With orient='index' the outer keys
# become the row labels and the inner keys become the columns.
pd.DataFrame.from_dict(
    {
        "row1": {"A": 1, "B": 4},
        "row2": {"A": 2, "B": 5},
        "row3": {"A": 3, "B": 6},
    },
    orient='index',
)
A B row1 1 4 row2 2 5 row3 3