Pandas中groupby高级用法

import pandas as pd
import numpy as np
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                           'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
df

ABCD
0fooone-0.673452-0.252112
1barone2.029065-0.443464
2footwo1.307105-0.716860
3barthree-0.207770-2.731742
4footwo1.617284-0.772246
5bartwo-1.1996730.366827
6fooone-0.683761-1.710473
7foothree0.2693690.402087

单列分组

g=df.groupby('A')
g.count() #统计分组个数

BCD
A


bar333
foo555

多列分组

g=df.groupby(['A','B'])
g.count()


CD
AB

barone11
three11
two11
fooone22
three11
two22
<span class="hljs-keyword">def get_letter_type(letter):</span>
    if letter.lower() in 'aeiou': ##列名字是{a,e,i,o,u}的分成一组,新组名vowel
        return 'a'
    else:
        return 'b'
g=df.groupby(get_letter_type,axis=1)
g.count()

ab
013
113
213
313
413
513
613
713
df1=pd.Series([1,2,3,1,2,3],[7,8,9,9,8,7])
df1
7    1
8    2
9    3
9    1
8    2
7    3
dtype: int64
g=df1.groupby(level=0)
g.first()#查看分组后的第一行数据
7    1
8    2
9    3
dtype: int64
g.last() #查看分组后的最后一行数据
7    3
8    2
9    1
dtype: int64
g.sum() #对分组的各组进行求和
7    4
8    4
9    4
dtype: int64
g=df.groupby(level=0,sort=True)
g.first()

ABCD
0fooone-0.673452-0.252112
1barone2.029065-0.443464
2footwo1.307105-0.716860
3barthree-0.207770-2.731742
4footwo1.617284-0.772246
5bartwo-1.1996730.366827
6fooone-0.683761-1.710473
7foothree0.2693690.402087
df2=pd.DataFrame({'X':['a','b','b','a'],'Y':[1,2,3,4]})
df2

XY
0a1
1b2
2b3
3a4

根据X分组,获取a分组

g1=df2.groupby(['X']).get_group('a')
g1

XY
0a1
3a4
g2=df2.groupby(['X']).get_group('b')
g2

XY
1b2
2b3

MultiIndex表示多级索引,它是从Index继承过来的,其中多级标签用元组对象来表示。

arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
          ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
index=pd.MultiIndex.from_arrays(arrays,names=['index1','index2'])
data=pd.Series(np.random.randn(8),index=index)
data
index1  index2
bar     one       0.691435
        two       0.911557
baz     one      -0.055040
        two      -0.899152
foo     one       0.496195
        two       0.878070
qux     one      -0.348441
        two       0.936502
dtype: float64
g=data.groupby(level=0)
g.sum()
index1
bar    1.602991
baz   -0.954192
foo    1.374265
qux    0.588061
dtype: float64
g=data.groupby(level=1)
g.sum()
index2
one    0.784149
two    1.826977
dtype: float64
df

ABCD
0fooone-0.673452-0.252112
1barone2.029065-0.443464
2footwo1.307105-0.716860
3barthree-0.207770-2.731742
4footwo1.617284-0.772246
5bartwo-1.1996730.366827
6fooone-0.683761-1.710473
7foothree0.2693690.402087
df.groupby(['A','B']).sum()


CD
AB

barone2.029065-0.443464
three-0.207770-2.731742
two-1.1996730.366827
fooone-1.357213-1.962585
three0.2693690.402087
two2.924389-1.489106

reset_index重置索引或索引的级别。

df.groupby(['A','B']).sum().reset_index()

ABCD
0barone2.029065-0.443464
1barthree-0.207770-2.731742
2bartwo-1.1996730.366827
3fooone-1.357213-1.962585
4foothree0.2693690.402087
5footwo2.924389-1.489106
df.groupby(['A','B'],as_index=False).sum()

ABCD
0barone2.029065-0.443464
1barthree-0.207770-2.731742
2bartwo-1.1996730.366827
3fooone-1.357213-1.962585
4foothree0.2693690.402087
5footwo2.924389-1.489106
g=df.groupby(['A','B'])
g.size()
A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     three    1
     two      2
dtype: int64
g.describe().head()


CD


countmeanstdmin25%50%75%maxcountmeanstdmin25%50%75%max
AB















barone1.02.029065NaN2.0290652.0290652.0290652.0290652.0290651.0-0.443464NaN-0.443464-0.443464-0.443464-0.443464-0.443464
three1.0-0.207770NaN-0.207770-0.207770-0.207770-0.207770-0.2077701.0-2.731742NaN-2.731742-2.731742-2.731742-2.731742-2.731742
two1.0-1.199673NaN-1.199673-1.199673-1.199673-1.199673-1.1996731.00.366827NaN0.3668270.3668270.3668270.3668270.366827
fooone2.0-0.6786070.007289-0.683761-0.681184-0.678607-0.676029-0.6734522.0-0.9812931.031217-1.710473-1.345883-0.981293-0.616702-0.252112
three1.00.269369NaN0.2693690.2693690.2693690.2693690.2693691.00.402087NaN0.4020870.4020870.4020870.4020870.402087
g=df.groupby('A')
g.sum()

CD
A

bar0.621622-2.808379
foo1.836544-3.049605
g['C'].agg([np.mean,np.std,np.sum])

meanstdsum
A


bar0.2072071.6538870.621622
foo0.3673091.0774021.836544
返回笔记列表
入门小站