Pandas入门教程(六)

  import pandas as pd

  # Load the game logs, then report the frame's memory footprint.
  # memory_usage='deep' asks pandas to inspect object columns so the
  # reported total reflects the real size of the strings they hold.
  gl = pd.read_csv('./pandas/data/game_logs.csv')
  gl.info(memory_usage='deep')
  1. <class 'pandas.core.frame.DataFrame'>
  2. RangeIndex: 171907 entries, 0 to 171906
  3. Columns: 161 entries, date to acquisition_info
  4. dtypes: float64(77), int64(6), object(78)
  5. memory usage: 859.4 MB
  # For each dtype family, print the mean memory footprint (in MB) of the
  # entries returned by memory_usage(deep=True) for the selected columns.
  for dtype in ['float64', 'object', 'int64']:
      selected_dtype = gl.select_dtypes(include=[dtype])
      # .mean() averages the per-entry byte counts, so the figure printed
      # below is an average, not the total for the dtype.
      memory_usage_b = selected_dtype.memory_usage(deep=True).mean()
      memory_usage_mb = memory_usage_b / 1024 / 1024
      print('[%s] memory usage %0.2f MB' % (dtype, memory_usage_mb))
  1. [float64] memory usage 1.29 MB
  2. [object] memory usage 9.50 MB
  3. [int64] memory usage 1.12 MB
  # Value ranges of uint8, int8, int16, int32 and int64
  import numpy as np

  for dtype in ['uint8', 'int8', 'int16', 'int32', 'int64']:
      # np.iinfo describes the machine limits (min / max) of an integer dtype.
      print(np.iinfo(dtype))
  1. Machine parameters for uint8
  2. ---------------------------------------------------------------
  3. min = 0
  4. max = 255
  5. ---------------------------------------------------------------
  6. Machine parameters for int8
  7. ---------------------------------------------------------------
  8. min = -128
  9. max = 127
  10. ---------------------------------------------------------------
  11. Machine parameters for int16
  12. ---------------------------------------------------------------
  13. min = -32768
  14. max = 32767
  15. ---------------------------------------------------------------
  16. Machine parameters for int32
  17. ---------------------------------------------------------------
  18. min = -2147483648
  19. max = 2147483647
  20. ---------------------------------------------------------------
  21. Machine parameters for int64
  22. ---------------------------------------------------------------
  23. min = -9223372036854775808
  24. max = 9223372036854775807
  25. ---------------------------------------------------------------
  # Memory footprint of the data after type conversion
  def mem_usage(data):
      """Return the deep memory usage of *data* formatted as "<n.nn> MB".

      For a DataFrame the per-column byte counts are summed; for any other
      object (e.g. a Series) ``memory_usage(deep=True)`` is used directly.
      """
      if isinstance(data, pd.DataFrame):
          # DataFrame.memory_usage returns one value per column, so add them up.
          mem_b = data.memory_usage(deep=True).sum()
      else:
          mem_b = data.memory_usage(deep=True)
      # Bytes -> mebibytes, two decimal places.
      return "{:03.2f} MB".format(mem_b / 1024 ** 2)
  gl_int64 = gl.select_dtypes(include=['int64'])
  # Downcast the integer columns. The original listing applied this to the
  # undefined name `gl_int`, which raises NameError; the int64 selection
  # above is the frame that was meant.
  # NOTE: downcast='unsigned' picks unsigned integer dtypes, so the result is
  # not necessarily int32 despite the variable name (kept because later
  # listings refer to `gl_int32`).
  gl_int32 = gl_int64.apply(pd.to_numeric, downcast='unsigned')
  print(mem_usage(gl_int64))
  print(mem_usage(gl_int32))

  # Downcast the float64 columns to a smaller float dtype
  gl_float64 = gl.select_dtypes(include=['float64'])
  gl_float = gl_float64.apply(pd.to_numeric, downcast='float')
  print("转换前:" + mem_usage(gl_float64))
  print("转换后" + mem_usage(gl_float))
  1. 7.87 MB
  2. 1.48 MB
  3. 转换前:100.99 MB
  4. 转换后50.49 MB
  # Build the optimized frame on a copy so `gl` stays untouched and the two
  # sizes can be compared.
  opt_gl = gl.copy()
  # Overwrite the numeric columns with their downcast versions.
  opt_gl[gl_int32.columns] = gl_int32
  opt_gl[gl_float.columns] = gl_float
  print("原数据的大小:" + mem_usage(gl))
  print("转换后的数据大小:" + mem_usage(opt_gl))
  1. 原数据的大小:859.43 MB
  2. 转换后的数据大小:802.54 MB
  # Take an independent copy of the object-typed columns and summarize them
  # with describe().
  gl_obj = gl.select_dtypes(include=['object']).copy()
  print(gl_obj.describe())
  1. day_of_week v_name v_league h_name h_league day_night \
  2. count 171907 171907 171907 171907 171907 140150
  3. unique 7 148 7 148 7 2
  4. top Sat CHN NL CHN NL D
  5. freq 28891 8870 88866 9024 88867 82724
  6. completion forefeit protest park_id ... h_player_6_id \
  7. count 116 145 180 171907 ... 140838
  8. unique 116 3 5 245 ... 4774
  9. top 19590602,PIT06,2,1,39 H V STL07 ... grimc101
  10. freq 1 69 90 7022 ... 427
  11. h_player_6_name h_player_7_id h_player_7_name h_player_8_id \
  12. count 140838 140838 140838 140838
  13. unique 4720 5253 5197 4760
  14. top Charlie Grimm grimc101 Charlie Grimm lopea102
  15. freq 427 491 491 676
  16. h_player_8_name h_player_9_id h_player_9_name additional_info \
  17. count 140838 140838 140838 1456
  18. unique 4710 5193 5142 332
  19. top Al Lopez spahw101 Warren Spahn HTBF
  20. freq 676 339 339 1112
  21. acquisition_info
  22. count 140841
  23. unique 1
  24. top Y
  25. freq 140841
  26. [4 rows x 78 columns]
  # Look at one object column before conversion.
  dow = gl_obj.day_of_week
  print(dow.head())

  # Convert the same column to the category dtype and look again.
  dow_cat = dow.astype('category')
  print(dow_cat.head())

  # Compare the memory footprint before and after the conversion.
  print("转换前" + mem_usage(dow))
  print("转换后" + mem_usage(dow_cat))
  # Convert columns with many repeated values to category to reduce memory
  convert_obj = pd.DataFrame()
  for col in gl_obj.columns:
      num_unique = len(gl_obj[col].unique())
      num_total = len(gl_obj[col])
      # Only convert when fewer than half of the values are distinct;
      # other columns are copied over unchanged.
      if num_unique / num_total < 0.5:
          # Plain column assignment replaces the original `.loc[:, col] = ...`
          # as the idiomatic way to add a new column to the frame.
          convert_obj[col] = gl_obj[col].astype('category')
      else:
          convert_obj[col] = gl_obj[col]
  print('数据转换前:' + mem_usage(gl_obj))
  print('数据转换后:' + mem_usage(convert_obj))
  # Write the converted object columns back into the optimized frame and
  # print its resulting memory footprint.
  opt_gl[convert_obj.columns] = convert_obj
  print(mem_usage(opt_gl))
  # apply operations
  titanic = pd.read_csv('./pandas/data/titanic_train.csv')
  # Row at integer position 99 (shown when run interactively).
  titanic.iloc[99]
  # Fetch the data at row 99
  def get_row(data):
      """Return the entry of *data* at integer position 99 (``data.iloc[99]``)."""
      return data.iloc[99]
  # DataFrame.apply uses axis=0 by default, so get_row receives each column
  # in turn and the result holds the position-99 value of every column.
  row = titanic.apply(get_row)
  row
  # Count the NaN values in each column
  def get_null_count(data):
      """Return how many entries of *data* are null according to ``pd.isnull``."""
      # Boolean-mask the data down to its null entries and count them.
      return len(data[pd.isnull(data)])
  # Applied column by column (default axis=0): one null count per column.
  null_count = titanic.apply(get_null_count)
  print(null_count)
  # Data transformation
  def which_class(row):
      """Map the ``Pclass`` value of *row* to a human-readable label.

      Returns "Unknown" when the value is null, "One" / "Two" / "Three" for
      1 / 2 / 3, and ``None`` for any other value.
      """
      pclass = row['Pclass']
      if pd.isnull(pclass):
          # Spelling fixed: the original returned "UnKown".
          return "Unknown"
      elif pclass == 1:
          return "One"
      elif pclass == 2:
          # Spelling fixed: the original returned "Tow".
          return "Two"
      elif pclass == 3:
          return "Three"
      # Any other class value has no label (the original fell through to
      # an implicit None as well).
      return None
  # axis=1 passes each row to which_class, giving one label per row.
  classes = titanic.apply(which_class, axis=1)
  print(classes)
  # Find the records of minors
  def is_minor(row):
      """Return True when the ``Age`` value of *row* is below 18, else False.

      A NaN age compares as not-less-than 18, so it yields False.
      """
      # bool() keeps the return type a plain Python bool, as in the
      # original explicit True / False branches.
      return bool(row['Age'] < 18)
  # Row-wise apply builds a boolean mask, which then selects the matching rows.
  minor = titanic.apply(is_minor, axis=1)
  print(titanic[minor])

返回笔记列表
入门小站