Pandas入门教程(四)

groupby分组

  1. import pandas as pd
  2. import numpy as np
  3. df=pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
  4. 'foo', 'bar', 'foo', 'foo'],
  5. 'B' : ['one', 'one', 'two', 'three',
  6. 'two', 'two', 'one', 'three'],
  7. 'C' : np.random.randn(8),
  8. 'D' : np.random.randn(8)})
  9. print(df)
  10. grouped=df.groupby('A')
  11. print('-'*30)
  12. print(grouped.count())
  13. print('-'*30)
  14. grouped=df.groupby(['A','B'])
  15. print(grouped.count())
  16. print('-'*30)
  17. # 通过函数分组
  18. def get_letter_type(letter):
  19. if letter.lower() in 'aeiou':
  20. return 'a'
  21. else:
  22. return 'b'
  23. grouped=df.groupby(get_letter_type,axis=1)
  24. print(grouped.count())
  1. A B C D
  2. 0 foo one 1.429387 0.643569
  3. 1 bar one -0.858448 -0.213034
  4. 2 foo two 0.375644 0.214584
  5. 3 bar three 0.042284 -0.330481
  6. 4 foo two -1.421967 0.768176
  7. 5 bar two 1.293483 -0.399003
  8. 6 foo one -1.101385 -0.236341
  9. 7 foo three -0.852603 -1.718694
  10. ------------------------------
  11. B C D
  12. A
  13. bar 3 3 3
  14. foo 5 5 5
  15. ------------------------------
  16. C D
  17. A B
  18. bar one 1 1
  19. three 1 1
  20. two 1 1
  21. foo one 2 2
  22. three 1 1
  23. two 2 2
  24. ------------------------------
  25. a b
  26. 0 1 3
  27. 1 1 3
  28. 2 1 3
  29. 3 1 3
  30. 4 1 3
  31. 5 1 3
  32. 6 1 3
  33. 7 1 3
  1. se=pd.Series([1,2,3,4,5],[6,9,8,9,8])
  2. print(se)
  3. se.groupby(level=0)
  1. 6 1
  2. 9 2
  3. 8 3
  4. 9 4
  5. 8 5
  6. dtype: int64
  7. <pandas.core.groupby.generic.SeriesGroupBy object at 0x111b00040>
  1. # 分组求和
  2. grouped=se.groupby(level=0).sum()
  3. print(grouped)
  1. 6 1
  2. 8 8
  3. 9 6
  4. dtype: int64
  1. df2=pd.DataFrame({'X':['A','B','A','B'],'Y':[1,2,3,4]})
  2. print(df2)
  1. X Y
  2. 0 A 1
  3. 1 B 2
  4. 2 A 3
  5. 3 B 4
  1. # 按X分组,并查询A列的数据
  2. grp=df2.groupby('X').get_group('A')
  3. print(grp)
  1. X Y
  2. 0 A 1
  3. 2 A 3

Pandas 多级索引

  1. arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
  2. ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
  3. index=pd.MultiIndex.from_arrays(arrays,names=['first','second'])
  4. print(index)
  1. MultiIndex([('bar', 'one'),
  2. ('bar', 'two'),
  3. ('baz', 'one'),
  4. ('baz', 'two'),
  5. ('foo', 'one'),
  6. ('foo', 'two'),
  7. ('qux', 'one'),
  8. ('qux', 'two')],
  9. names=['first', 'second'])
  1. s=pd.Series(np.random.randn(8),index=index)
  2. print(s)
  1. first second
  2. bar one 0.120979
  3. two -0.440384
  4. baz one 0.515106
  5. two -0.019882
  6. foo one 1.149595
  7. two -0.369984
  8. qux one -0.930438
  9. two 0.146044
  10. dtype: float64
  1. # 分组求和
  2. grouped=s.groupby(level='first')
  3. print(grouped.sum())
  1. first
  2. bar -0.319405
  3. baz 0.495224
  4. foo 0.779611
  5. qux -0.784394
  6. dtype: float64
  1. grouped=df.groupby(['A','B'])
  2. print(grouped.size())
  1. A B
  2. bar one 1
  3. three 1
  4. two 1
  5. foo one 2
  6. three 1
  7. two 2
  8. dtype: int64
  1. print(df)
  1. A B C D
  2. 0 foo one 1.429387 0.643569
  3. 1 bar one -0.858448 -0.213034
  4. 2 foo two 0.375644 0.214584
  5. 3 bar three 0.042284 -0.330481
  6. 4 foo two -1.421967 0.768176
  7. 5 bar two 1.293483 -0.399003
  8. 6 foo one -1.101385 -0.236341
  9. 7 foo three -0.852603 -1.718694
  1. print(grouped.describe().head())
  1. C \
  2. count mean std min 25% 50% 75%
  3. A B
  4. bar one 1.0 -0.858448 NaN -0.858448 -0.858448 -0.858448 -0.858448
  5. three 1.0 0.042284 NaN 0.042284 0.042284 0.042284 0.042284
  6. two 1.0 1.293483 NaN 1.293483 1.293483 1.293483 1.293483
  7. foo one 2.0 0.164001 1.789526 -1.101385 -0.468692 0.164001 0.796694
  8. three 1.0 -0.852603 NaN -0.852603 -0.852603 -0.852603 -0.852603
  9. D \
  10. max count mean std min 25% 50%
  11. A B
  12. bar one -0.858448 1.0 -0.213034 NaN -0.213034 -0.213034 -0.213034
  13. three 0.042284 1.0 -0.330481 NaN -0.330481 -0.330481 -0.330481
  14. two 1.293483 1.0 -0.399003 NaN -0.399003 -0.399003 -0.399003
  15. foo one 1.429387 2.0 0.203614 0.622191 -0.236341 -0.016364 0.203614
  16. three -0.852603 1.0 -1.718694 NaN -1.718694 -1.718694 -1.718694


75% max
A B
bar one -0.213034 -0.213034
three -0.330481 -0.330481
two -0.399003 -0.399003
foo one 0.423592 0.643569
three -1.718694 -1.718694

  1. grouped=df.groupby('A')
  2. grouped['C'].agg([np.sum,np.mean,np.std])
sum mean std
A
bar 0.477319 0.159106 1.080712
foo -1.570925 -0.314185 1.188767

字符串操作

  1. import pandas as pd
  2. import numpy as np
  3. s=pd.Series(['A','b','c','D',np.nan])
  4. print(s)
  5. # 转小写
  6. print(s.str.lower())
  7. # 转大写
  8. print(s.str.upper())
  9. # 每个字符的长度
  10. print(s.str.len())
  1. 0 A
  2. 1 b
  3. 2 c
  4. 3 D
  5. 4 NaN
  6. dtype: object
  7. 0 a
  8. 1 b
  9. 2 c
  10. 3 d
  11. 4 NaN
  12. dtype: object
  13. 0 A
  14. 1 B
  15. 2 C
  16. 3 D
  17. 4 NaN
  18. dtype: object
  19. 0 1.0
  20. 1 1.0
  21. 2 1.0
  22. 3 1.0
  23. 4 NaN
  24. dtype: float64
  1. index=pd.Index([' Index','ru ',' men'])
  2. # 去掉空格
  3. print(index.str.strip())
  4. # 去掉左边的空格
  5. print(index.str.lstrip())
  6. # 去掉右边的空格
  7. print(index.str.rstrip())
  1. Index(['Index', 'ru', 'men'], dtype='object')
  2. Index(['Index', 'ru ', 'men'], dtype='object')
  3. Index([' Index', 'ru', ' men'], dtype='object')
  1. df=pd.DataFrame(np.random.randn(3,2),columns=['A a','B b'],index=range(3))
  2. print(df)
  1. A a B b
  2. 0 3.005273 0.486696
  3. 1 1.093889 1.054230
  4. 2 -2.846352 0.302465
  1. # 列替换
  2. print(df.columns.str.replace(' ','_'))
  1. Index(['A_a', 'B_b'], dtype='object')
  1. s=pd.Series(['a_b_C','c_d_e','f_g_h'])
  2. print(s)
  1. 0 a_b_C
  2. 1 c_d_e
  3. 2 f_g_h
  4. dtype: object
  1. print(s.str.split('_'))
  1. 0 [a, b, C]
  2. 1 [c, d, e]
  3. 2 [f, g, h]
  4. dtype: object
  1. print(s.str.split('_',expand=True,n=1))
  1. 0 1
  2. 0 a b_C
  3. 1 c d_e
  4. 2 f g_h
  1. s = pd.Series(['A','rumen','ru','rumen','xiao','zhan'])
  1. print(s.str.contains('ru'))
  1. 0 False
  2. 1 True
  3. 2 True
  4. 3 True
  5. 4 False
  6. 5 False
  7. dtype: bool
  1. s=pd.Series(['a','a|b','a|c'])
  2. print(s)
  1. 0 a
  2. 1 a|b
  3. 2 a|c
  4. dtype: object
  1. print(s.str.get_dummies(sep='|'))
  1. a b c
  2. 0 1 0 0
  3. 1 1 1 0
  4. 2 1 0 1

索引

  1. s=pd.Series(np.arange(5),np.arange(5)[::-1],dtype='int64')
  1. s
  1. 4 0
  2. 3 1
  3. 2 2
  4. 1 3
  5. 0 4
  6. dtype: int64
  1. print(s[s>2])
  1. 1 3
  2. 0 4
  3. dtype: int64
  1. # isin查询索引在某个范围
  2. print(s.isin([1,3,4]))
  1. 4 False
  2. 3 True
  3. 2 False
  4. 1 True
  5. 0 True
  6. dtype: bool
  1. # 根据索引查询数据
  2. print(s[s.isin([1,3,4])])
  1. 3 1
  2. 1 3
  3. 0 4
  4. dtype: int64
  1. # 构造一个联合索引的数据
  2. s=pd.Series(np.arange(6),index=pd.MultiIndex.from_product([[1,2],['a','b','c']]))
  1. print(s)
  1. 1 a 0
  2. b 1
  3. c 2
  4. 2 a 3
  5. b 4
  6. c 5
  7. dtype: int64
  1. print(s.iloc[s.index.isin([(1,'b'),(2,'c')])])
  1. 1 b 1
  2. 2 c 5
  3. dtype: int64
  1. # 构造一个时间序列
  2. dates=pd.date_range('20200920',periods=8)
  3. print(dates)
  1. DatetimeIndex(['2020-09-20', '2020-09-21', '2020-09-22', '2020-09-23',
  2. '2020-09-24', '2020-09-25', '2020-09-26', '2020-09-27'],
  3. dtype='datetime64[ns]', freq='D')
  1. df=pd.DataFrame(np.random.randn(8,4),index=dates,columns=['A','B','C','D'])
  2. print(df)
  1. A B C D
  2. 2020-09-20 -1.218522 2.067088 0.015009 0.158780
  3. 2020-09-21 -0.546837 -0.601178 -0.894882 0.172037
  4. 2020-09-22 0.189848 -0.910520 0.196186 -0.073495
  5. 2020-09-23 -0.566892 0.899193 -0.450925 0.633253
  6. 2020-09-24 0.038838 1.577004 0.580927 0.609050
  7. 2020-09-25 1.562094 0.020813 -0.618859 -0.515212
  8. 2020-09-26 -1.333947 0.275765 0.139325 1.124207
  9. 2020-09-27 -1.271748 1.082302 1.036805 -1.041206
  1. # 查询A列数据
  2. print(df['A'])
  1. 2020-09-20 -1.218522
  2. 2020-09-21 -0.546837
  3. 2020-09-22 0.189848
  4. 2020-09-23 -0.566892
  5. 2020-09-24 0.038838
  6. 2020-09-25 1.562094
  7. 2020-09-26 -1.333947
  8. 2020-09-27 -1.271748
  9. Freq: D, Name: A, dtype: float64
  1. # 查询小于0的数字,大于0的值默认被置为NaN
  2. df.where(df<0)
A B C D
2020-09-20 -1.218522 NaN NaN NaN
2020-09-21 -0.546837 -0.601178 -0.894882 NaN
2020-09-22 NaN -0.910520 NaN -0.073495
2020-09-23 -0.566892 NaN -0.450925 NaN
2020-09-24 NaN NaN NaN NaN
2020-09-25 NaN NaN -0.618859 -0.515212
2020-09-26 -1.333947 NaN NaN NaN
2020-09-27 -1.271748 NaN NaN -1.041206
  1. # 查询小于0的数字,大于0的值变成负数
  2. print(df.where(df<0,-df))
  1. A B C D
  2. 2020-09-20 -1.218522 -2.067088 -0.015009 -0.158780
  3. 2020-09-21 -0.546837 -0.601178 -0.894882 -0.172037
  4. 2020-09-22 -0.189848 -0.910520 -0.196186 -0.073495
  5. 2020-09-23 -0.566892 -0.899193 -0.450925 -0.633253
  6. 2020-09-24 -0.038838 -1.577004 -0.580927 -0.609050
  7. 2020-09-25 -1.562094 -0.020813 -0.618859 -0.515212
  8. 2020-09-26 -1.333947 -0.275765 -0.139325 -1.124207
  9. 2020-09-27 -1.271748 -1.082302 -1.036805 -1.041206
  1. # 查询小于0的数据,大于0的置为1000
  2. print(df.where(df<0,1000))
  1. A B C D
  2. 2020-09-20 -1.218522 1000.000000 1000.000000 1000.000000
  3. 2020-09-21 -0.546837 -0.601178 -0.894882 1000.000000
  4. 2020-09-22 1000.000000 -0.910520 1000.000000 -0.073495
  5. 2020-09-23 -0.566892 1000.000000 -0.450925 1000.000000
  6. 2020-09-24 1000.000000 1000.000000 1000.000000 1000.000000
  7. 2020-09-25 1000.000000 1000.000000 -0.618859 -0.515212
  8. 2020-09-26 -1.333947 1000.000000 1000.000000 1000.000000
  9. 2020-09-27 -1.271748 1000.000000 1000.000000 -1.041206
  1. # 构造一个10行3列的数据
  2. df=pd.DataFrame(np.random.randn(10,3),columns=list('abc'))
  3. print(df)
  1. a b c
  2. 0 1.761415 0.528009 -0.347271
  3. 1 -0.682149 0.353312 0.337229
  4. 2 1.080733 -0.272290 1.020335
  5. 3 -0.979681 -1.753745 0.836387
  6. 4 0.243748 2.085531 -0.993318
  7. 5 -1.041006 1.518130 -0.087383
  8. 6 -1.400354 -0.095196 3.043639
  9. 7 -0.835144 0.926415 -1.217102
  10. 8 0.326098 1.079906 0.156884
  11. 9 1.836618 -1.288516 -2.492620
  1. # 查询a>b的数据
  2. print(df.query('a>b'))
  1. a b c
  2. 0 1.761415 0.528009 -0.347271
  3. 2 1.080733 -0.272290 1.020335
  4. 3 -0.979681 -1.753745 0.836387
  5. 9 1.836618 -1.288516 -2.492620
  1. # 查询c>b>a的数据
  2. print(df.query('(c<b) & (b<a)'))
  1. a b c
  2. 0 1.761415 0.528009 -0.347271
  3. 9 1.836618 -1.288516 -2.492620

返回笔记列表
入门小站