示例数据:
>>> df1
c1 c2 v1 v2
0 A NaN 9 2
1 B NaN 2 5
2 C NaN 3 5
3 D NaN 4 2
>>> df2
c1 c2 v1 v2
0 A P 4 1
1 A T 3 1
2 A Y 2 0
3 B P 0 1
4 B T 2 2
5 B Y 0 2
6 C P 1 2
7 C T 1 2
8 C Y 1 1
9 D P 1 1
10 D T 2 0
11 D Y 1 1
df1需要在v1列上排序,然后df2需要在排序df1后根据c1列的值顺序排序,然后根据df2的v2列进行排序。
实现效果:
c1 c2 v1 v2
0 B NaN 2 5
1 B P 0 1
2 B T 2 2
3 B Y 0 2
4 C NaN 3 5
5 C Y 1 1
6 C P 1 2
7 C T 1 2
8 D NaN 4 2
9 D T 2 0
10 D P 1 1
11 D Y 1 1
12 A NaN 9 2
13 A Y 2 0
14 A P 4 1
15 A T 3 1
1、使用for循环实现
使用for循环实现比较简单容易理解,但如果数据量比较大就效率不是很高。
import pandas as pd from numpy import nan df1 = pd.DataFrame({ 'c1': ['A', 'B', 'C', 'D'], 'c2': [nan, nan, nan, nan], 'v1': [9, 2, 3, 4], 'v2': [2, 5, 5, 2] }) df2 = pd.DataFrame({ 'c1': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D'], 'c2': ['P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y'], 'v1': [4, 3, 2, 0, 2, 0, 1, 1, 1, 1, 2, 1], 'v2': [1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 0, 1] }) result = [] for i,row in df1.sort_values('v1').iterrows(): result.append(row.to_frame().T) result.append(df2[df2['c1'].eq(row['c1'])].sort_values('v2')) print(pd.concat(result, ignore_index=True))
2、使用groupby实现
import pandas as pd from numpy import nan import itertools df1 = pd.DataFrame({ 'c1': ['A', 'B', 'C', 'D'], 'c2': [nan, nan, nan, nan], 'v1': [9, 2, 3, 4], 'v2': [2, 5, 5, 2] }) df2 = pd.DataFrame({ 'c1': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D'], 'c2': ['P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y'], 'v1': [4, 3, 2, 0, 2, 0, 1, 1, 1, 1, 2, 1], 'v2': [1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 0, 1] }) out = pd.concat([df1.sort_values('v1'), df2.sort_values('v2')], ignore_index=True) print(out.loc[itertools.chain.from_iterable(out.groupby('c1', sort=False) .groups.values())])
3、使用dict实现
import pandas as pd from numpy import nan df1 = pd.DataFrame({ 'c1': ['A', 'B', 'C', 'D'], 'c2': [nan, nan, nan, nan], 'v1': [9, 2, 3, 4], 'v2': [2, 5, 5, 2] }) df2 = pd.DataFrame({ 'c1': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D'], 'c2': ['P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y'], 'v1': [4, 3, 2, 0, 2, 0, 1, 1, 1, 1, 2, 1], 'v2': [1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 0, 1] }) def mysort(df1,df2): d = dict(zip(df1['c1'],df1['v1'].rank())) o = pd.concat((df1,df2),keys=[1,2],names=['Key']) return (o.assign(k=o['c1'].map(d)).sort_values(['k','Key','v2']) .loc[:,list(df2)])#.reset_index(drop=True) print(mysort(df1,df2))
4、使用concatenate()和argsort()实现
import pandas as pd from numpy import nan import numpy as np df1 = pd.DataFrame({ 'c1': ['A', 'B', 'C', 'D'], 'c2': [nan, nan, nan, nan], 'v1': [9, 2, 3, 4], 'v2': [2, 5, 5, 2] }) df2 = pd.DataFrame({ 'c1': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D'], 'c2': ['P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y'], 'v1': [4, 3, 2, 0, 2, 0, 1, 1, 1, 1, 2, 1], 'v2': [1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 0, 1] }) def get_group_sort(group_i, k, v): ix = np.where(group_i == k)[0] ix1 = ix[1:] return np.r_[ix[0], ix1[np.argsort(v[ix1])]] def mysort(df1, df2): ng = len(df1) z = pd.concat([df1.sort_values('v1'), df2]).reset_index(drop=True) gb = z['v2'].groupby(z['c1'], sort=False) group_i = gb.ngroup().values v2 = z['v2'].values ix = np.concatenate([get_group_sort(group_i, k, v2) for k in range(ng)]) return z.iloc[ix] print(mysort(df1,df2))
5、使用reindex()和transform()等实现
import pandas as pd from numpy import nan import numpy as np df1 = pd.DataFrame({ 'c1': ['A', 'B', 'C', 'D'], 'c2': [nan, nan, nan, nan], 'v1': [9, 2, 3, 4], 'v2': [2, 5, 5, 2] }) df2 = pd.DataFrame({ 'c1': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D'], 'c2': ['P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y'], 'v1': [4, 3, 2, 0, 2, 0, 1, 1, 1, 1, 2, 1], 'v2': [1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 0, 1] }) df = pd.concat([df1, df2], ignore_index=True) print(df.reindex(df.sort_values('c1').groupby('c1', as_index=False)['v1'].transform('min').squeeze().sort_values().index).reset_index(drop=True))