pandas 处理数据表常用公式

136 阅读 0 评论 90 点赞

我是靠谱客的博主无聊外套，这篇文章主要介绍pandas 处理数据表常用公式，现在分享给大家，希望可以做个参考。

复制代码

# =============================================================================
# Pandas
# pandas is a fast, powerful, flexible and easy to use open source data analysis 
# and manipulation tool, built on top of the Python programming language.
# 在Pandas中有两种主要的数据结构，Series & DataFrame
# Series 可以理解为一维数组，与一维数组主要区别为Series具有索引（index)
# DataFrame可以理解为二维结构的tabular, 类比excel中的一张表
# =============================================================================
import pandas as pd

# The DataFrame is one of Pandas' most important data structures. 
# It's basically a way to store tabular data where you can label the rows 
# and the columns. One way to build a DataFrame is from a dictionary.
# Build a DataFrame from a dict of equal-length lists: every key becomes a
# column. Both constructions below produce the same table.
test = {'a': [1, 2, 3, 4, 5], 'b': [9, 8, 7, 6, 5]}
test_df1 = pd.DataFrame(test)
test_df2 = pd.DataFrame.from_dict(test, orient='columns')

# Note: a dict whose values are all scalars cannot become a DataFrame unless
# an index is supplied. The failing call is guarded so that the error is
# shown without aborting the rest of the script.
europe = {'spain': 'madrid', 'france': 'paris', 'germany': 'berlin', 'norway': 'oslo'}
try:
    europe_df = pd.DataFrame(europe)
except ValueError as err:
    # "If using all scalar values, you must pass an index"
    print('ValueError:', err)
# Supplying a one-element index turns the same dict into a one-row table.
europe_df = pd.DataFrame(europe, index=[0])

# =============================================================================
# candy_crush 为一款三消游戏， 此数据集中记录了一周的玩家游戏记录
# 游戏可以选择关卡，可以重复玩同一个关卡
# 数据下载链接见文末
# =============================================================================
# Load the Candy Crush play log from a machine-specific location.
# The Windows path separators had been lost, leaving one run-together name;
# they are restored here. Adjust the path to wherever the CSV was downloaded.
candy_crush = pd.read_csv(r'C:\Users\zhou.c.15\Downloads\candy_crush.csv')
# info() writes its summary to stdout itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
candy_crush.info()
print(candy_crush.head())
# =============================================================================
# Subset and Slicing
# =============================================================================
# First 50 rows.
print(candy_crush[:50])
# Column labels.
print(candy_crush.columns)
# Two ways to rename columns:
#   all at once : candy_crush.columns = new_columns   (list every name)
#   selectively : candy_crush.rename(columns={'player_id': 'playerid'}, inplace=True)
# inplace defaults to False; unless it is set to True the original
# DataFrame keeps its column names.

# Selecting with a list of names (double brackets) gives a DataFrame;
# selecting with a single name gives a Series.
type(candy_crush[['player_id', 'dt']])
type(candy_crush[['player_id']])
type(candy_crush['player_id'])

# Locating data
# Author's note: to modify part of a table, do not go through a plain slice;
# pick the target rows/columns with .loc/.iloc (optionally combined with a
# boolean filter) and assign to that selection directly.
# iloc selects by integer position.
candy_crush.iloc[0]       # first row -> Series
candy_crush.iloc[[0]]     # first row -> DataFrame
candy_crush.iloc[0:3, 0]  # first three rows of the first column -> Series
candy_crush.iloc[:, [0]]  # every row of the first column -> DataFrame
candy_crush.iloc[:, 0:3]  # first three columns -> DataFrame (a slice needs no extra brackets)

# Replace the default index with the player_id column.
candy_crush.set_index('player_id', inplace=True)
candy_crush.iloc[0:3]     # positional access is unaffected by the new index

# loc selects by row/column label.
# With player_id as the index the label 0 is expected to be absent; the
# lookup is guarded so the demonstration does not abort the script.
try:
    candy_crush.loc[0]
except KeyError as err:
    print('KeyError:', err)
type(candy_crush.loc[:, 'dt'])    # the dt column -> Series
type(candy_crush.loc[:, ['dt']])  # the dt column -> DataFrame
# drop=False keeps the old index (player_id) as a regular column.
candy_crush.reset_index(drop=False, inplace=True)
# After the reset the label 0 exists again.
candy_crush.loc[0]        # first row -> Series

# =============================================================================
# Filtering
# =============================================================================
# Comparing a column with a value yields a boolean Series usable as a row mask.
ftl = candy_crush['dt'] == '2014-01-01'
candy_crush[ftl]  # rows dated 2014-01-01

# Combine several conditions with | & ~ (each condition in parentheses).
multi_fil = (candy_crush['dt'] == '2014-01-01') & (candy_crush['num_success'] == 1)
candy_crush[multi_fil]   # rows dated 2014-01-01 whose num_success equals 1
candy_crush[~multi_fil]  # every other row, i.e. those failing at least one of the two conditions
candy_crush['player_id'][multi_fil]           # player ids of the matching rows
candy_crush['player_id'][multi_fil].unique()  # de-duplicated ids -> numpy.ndarray
# Note: unique() is a Series method, so both
#   candy_crush.player_id[multi_fil].unique()
#   candy_crush['player_id'][multi_fil].unique()
# work, while candy_crush[['player_id']][multi_fil].unique() fails because
# the double brackets produce a DataFrame.
player_list = list(candy_crush['player_id'][multi_fil].unique())  # as a plain list

# =============================================================================
# 聚合与 Agg functions: 
# 当只有一个agg function的时候，可以用.groupby()[].aggfunc(), aggfunc 有 sum, mean, count...
# 多个agg function 计算时使用.groupby()[].agg(func1, func2, ...)
# =============================================================================
# groupby, usually paired with an aggregation function.
# DAU: number of distinct player_id values per dt.
DAU = candy_crush.groupby(['dt'])[['player_id']].nunique()
# Level difficulty analysis.
# Number of distinct players seen on each level.
byLevel_player = candy_crush.groupby(['level'])['player_id'].agg('nunique')
# Double brackets are needed to select several columns before aggregating;
# the result has two-level columns: (column, aggregation).
byLevel = candy_crush.groupby(['level'])[['num_attempts', 'num_success']].agg(['sum', 'mean'])
# Add derived columns through .loc.
# User_avg_attemp#: total attempts on the level divided by its distinct players.
byLevel.loc[:, 'User_avg_attemp#'] = byLevel[('num_attempts', 'sum')] / byLevel_player
# User_success_rate: total successes divided by total attempts on the level.
byLevel.loc[:, 'User_success_rate'] = byLevel[('num_success', 'sum')] / byLevel[('num_attempts', 'sum')]
# User_avg_attemp# differs from the 'mean' of num_attempts: the former divides
# by distinct players, the latter averages over the grouped rows.
# Author's observation: User_success_rate is highly correlated with User_avg_attemp#.
import matplotlib.pyplot as plt
byLevel[['User_avg_attemp#', 'User_success_rate']].plot(
    x='User_avg_attemp#',
    y='User_success_rate',
    kind="scatter",
)
plt.show()

# 歪个话题，记几个添加列的方法
# =============================================================================
# 1. 直接命名并指定value
# from datetime import datetime
# candy_crush['create_time'] = datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")
# candy_crush.drop(['create_time'], axis = 1, inplace = True) # axis = 0 是删除行，设置inplace = True 对原数据做修改
# 2. insert 方法， 第一个参数指定插入列的位置，第二个指定列名，第三个指定插入的值
# candy_crush.insert(0,'ID', range(candy_crush.shape[0]))
# 3. 直接赋值
# candy_crush['new_columns'] = value
# 4. reindex 并指定fill_value; 不是常规用法，需要列出所有的列名（包括新增列名），并且fill_value会把原有列中的缺失值都替换掉
# candy_crush.reindex(columns = [], fill_value = )
# 5. concat 方法，用于横向表拼接，参见"Merge/concat/join tables"
# 6. iloc/loc 方法
# =============================================================================

# =============================================================================
# pivot_table and melt functions
# =============================================================================
# A pivot table needs an index.
# This pivot is purely a demonstration; it serves no analytical purpose.
pivot_candy = pd.pivot_table(
    candy_crush,
    index=['dt'],
    columns=['level'],
    values=['num_attempts', 'num_success'],
    aggfunc='sum',
    fill_value=0,
)
# The pivoted columns form a MultiIndex; droplevel can remove a level:
# pivot_candy = pivot_candy.droplevel(None, axis = 1)

# Unpivoting: pd.melt(frame, ...) or frame.melt(id_vars, value_vars, var_name, value_name)
#   id_vars    : columns to keep as identifiers (not unpivoted)
#   value_vars : columns to unpivot; omit to unpivot all remaining columns
#   var_name / value_name : names for the resulting variable and value columns
#   col_level  : level to melt when the columns are a MultiIndex
unpivot_candy = pd.melt(
    pivot_candy,
    var_name=['Statue', 'level'],
    value_name='number of times',
)

# =============================================================================
# Pandas: Data manipulation
# =============================================================================
# Two small tables whose join keys have different names on each side.
left = pd.DataFrame(dict(
    key1=["K0", "K0", "K1", "K2"],
    key2=["K0", "K1", "K0", "K1"],
    A=["A0", "A1", "A2", "A3"],
    B=["B0", "B1", "B2", "B3"],
))

right = pd.DataFrame(dict(
    key3=["K0", "K1", "K1", "K2"],
    key4=["K0", "K0", "K0", "K0"],
    C=["C0", "C1", "C2", "C3"],
    D=["D0", "D1", "D2", "D3"],
))

# Merge / concat / join
# 1. merge: comparable to a SQL join. pd.merge(t1, t2) and t1.merge(t2) are
#    interchangeable. Use on= when the key has the same name in both tables,
#    left_on=/right_on= when the names differ; with none given, the columns
#    sharing a name are used. suffixes= is appended to non-key columns whose
#    names clash.
table = pd.merge(
    left,
    right,
    how='left',
    left_on=["key1", "key2"],
    right_on=["key3", "key4"],
    suffixes=("_left", "_right"),
)
# 2. pd.concat([t1, t2, ...], axis=0/1, keys=None, join="outer", ...)
#    Stacks tables vertically (axis=0, the default) or side by side (axis=1).
#    Columns missing from one table are filled with NaN; join may be
#    "outer" (default) or "inner".
pd.concat([left, right], axis=0)
pd.concat([left, right], axis=1)  # side by side; concat takes no join key
# 3. DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False)
#    Parameters mirror merge; the default is a left join on the row index.
#    Set lsuffix/rsuffix when the tables share column names. To join on
#    columns, index the other table by its key first:
#    df1.join(df2.set_index(key_of_df2), on=key_of_df1); df2's key columns
#    are then not kept in the result.
left.join(right.set_index(["key3", "key4"]), on=["key1", "key2"])

# Count the non-NA cells of every column.
# None, NaN and NaT are treated as NA.
table.count()
# Count the NA cells of every column; isna() and isnull() are aliases.
table.isnull().sum()

# Finding missing data: NaN, None, NaT
# NaN : "Not a Number"; exposed by numpy as np.nan (a float value).
# None: Python's null object; distinct from an empty list or empty string.
# NaT : "Not a Time"; marks an unknown or missing value in datetime data.
import numpy as np
# np.nan is the supported spelling: the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
type(np.nan)   # float
type(None)     # NoneType
type(pd.NaT)   # pandas' NaT type

# =============================================================================
# 其他用法
# =============================================================================
# Inspect the column data types: info() prints a summary that includes them ...
candy_crush.info()
# ... and the dtypes attribute holds them as a Series.
candy_crush.dtypes

# delete row/column
# candy_crush.drop([index/columns list])

# sort rows
# if you want to change the order of the rows. You can sort the rows by 
# passing a column name to .sort_values()
# candy_crush.sort_values('', inplace = True)

# drop duplicates
# pd.DataFrame.drop_duplicates()

# =============================================================================
# Write DataFrame into CSV file
# =============================================================================
# candy_crush.to_csv(dir)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# =============================================================================
# Pandas
# pandas is a fast, powerful, flexible and easy to use open source data analysis 
# and manipulation tool, built on top of the Python programming language.
# 在Pandas中有两种主要的数据结构，Series & DataFrame
# Series 可以理解为一维数组，与一维数组主要区别为Series具有索引（index)
# DataFrame可以理解为二维结构的tabular, 类比excel中的一张表
# =============================================================================
import pandas as pd

# The DataFrame is one of Pandas' most important data structures. 
# It's basically a way to store tabular data where you can label the rows 
# and the columns. One way to build a DataFrame is from a dictionary.
# Build a DataFrame from a dict of equal-length lists: every key becomes a
# column. Both constructions below produce the same table.
test = {'a': [1, 2, 3, 4, 5], 'b': [9, 8, 7, 6, 5]}
test_df1 = pd.DataFrame(test)
test_df2 = pd.DataFrame.from_dict(test, orient='columns')

# Note: a dict whose values are all scalars cannot become a DataFrame unless
# an index is supplied. The failing call is guarded so that the error is
# shown without aborting the rest of the script.
europe = {'spain': 'madrid', 'france': 'paris', 'germany': 'berlin', 'norway': 'oslo'}
try:
    europe_df = pd.DataFrame(europe)
except ValueError as err:
    # "If using all scalar values, you must pass an index"
    print('ValueError:', err)
# Supplying a one-element index turns the same dict into a one-row table.
europe_df = pd.DataFrame(europe, index=[0])

# =============================================================================
# candy_crush 为一款三消游戏， 此数据集中记录了一周的玩家游戏记录
# 游戏可以选择关卡，可以重复玩同一个关卡
# 数据下载链接见文末
# =============================================================================
# Load the Candy Crush play log from a machine-specific location.
# The Windows path separators had been lost, leaving one run-together name;
# they are restored here. Adjust the path to wherever the CSV was downloaded.
candy_crush = pd.read_csv(r'C:\Users\zhou.c.15\Downloads\candy_crush.csv')
# info() writes its summary to stdout itself and returns None, so wrapping it
# in print() only adds a stray "None" line.
candy_crush.info()
print(candy_crush.head())
# =============================================================================
# Subset and Slicing
# =============================================================================
# First 50 rows.
print(candy_crush[:50])
# Column labels.
print(candy_crush.columns)
# Two ways to rename columns:
#   all at once : candy_crush.columns = new_columns   (list every name)
#   selectively : candy_crush.rename(columns={'player_id': 'playerid'}, inplace=True)
# inplace defaults to False; unless it is set to True the original
# DataFrame keeps its column names.

# Selecting with a list of names (double brackets) gives a DataFrame;
# selecting with a single name gives a Series.
type(candy_crush[['player_id', 'dt']])
type(candy_crush[['player_id']])
type(candy_crush['player_id'])

# Locating data
# Author's note: to modify part of a table, do not go through a plain slice;
# pick the target rows/columns with .loc/.iloc (optionally combined with a
# boolean filter) and assign to that selection directly.
# iloc selects by integer position.
candy_crush.iloc[0]       # first row -> Series
candy_crush.iloc[[0]]     # first row -> DataFrame
candy_crush.iloc[0:3, 0]  # first three rows of the first column -> Series
candy_crush.iloc[:, [0]]  # every row of the first column -> DataFrame
candy_crush.iloc[:, 0:3]  # first three columns -> DataFrame (a slice needs no extra brackets)

# Replace the default index with the player_id column.
candy_crush.set_index('player_id', inplace=True)
candy_crush.iloc[0:3]     # positional access is unaffected by the new index

# loc selects by row/column label.
# With player_id as the index the label 0 is expected to be absent; the
# lookup is guarded so the demonstration does not abort the script.
try:
    candy_crush.loc[0]
except KeyError as err:
    print('KeyError:', err)
type(candy_crush.loc[:, 'dt'])    # the dt column -> Series
type(candy_crush.loc[:, ['dt']])  # the dt column -> DataFrame
# drop=False keeps the old index (player_id) as a regular column.
candy_crush.reset_index(drop=False, inplace=True)
# After the reset the label 0 exists again.
candy_crush.loc[0]        # first row -> Series

# =============================================================================
# Filtering
# =============================================================================
# Comparing a column with a value yields a boolean Series usable as a row mask.
ftl = candy_crush['dt'] == '2014-01-01'
candy_crush[ftl]  # rows dated 2014-01-01

# Combine several conditions with | & ~ (each condition in parentheses).
multi_fil = (candy_crush['dt'] == '2014-01-01') & (candy_crush['num_success'] == 1)
candy_crush[multi_fil]   # rows dated 2014-01-01 whose num_success equals 1
candy_crush[~multi_fil]  # every other row, i.e. those failing at least one of the two conditions
candy_crush['player_id'][multi_fil]           # player ids of the matching rows
candy_crush['player_id'][multi_fil].unique()  # de-duplicated ids -> numpy.ndarray
# Note: unique() is a Series method, so both
#   candy_crush.player_id[multi_fil].unique()
#   candy_crush['player_id'][multi_fil].unique()
# work, while candy_crush[['player_id']][multi_fil].unique() fails because
# the double brackets produce a DataFrame.
player_list = list(candy_crush['player_id'][multi_fil].unique())  # as a plain list

# =============================================================================
# 聚合与 Agg functions: 
# 当只有一个agg function的时候，可以用.groupby()[].aggfunc(), aggfunc 有 sum, mean, count...
# 多个agg function 计算时使用.groupby()[].agg(func1, func2, ...)
# =============================================================================
# groupby, usually paired with an aggregation function.
# DAU: number of distinct player_id values per dt.
DAU = candy_crush.groupby(['dt'])[['player_id']].nunique()
# Level difficulty analysis.
# Number of distinct players seen on each level.
byLevel_player = candy_crush.groupby(['level'])['player_id'].agg('nunique')
# Double brackets are needed to select several columns before aggregating;
# the result has two-level columns: (column, aggregation).
byLevel = candy_crush.groupby(['level'])[['num_attempts', 'num_success']].agg(['sum', 'mean'])
# Add derived columns through .loc.
# User_avg_attemp#: total attempts on the level divided by its distinct players.
byLevel.loc[:, 'User_avg_attemp#'] = byLevel[('num_attempts', 'sum')] / byLevel_player
# User_success_rate: total successes divided by total attempts on the level.
byLevel.loc[:, 'User_success_rate'] = byLevel[('num_success', 'sum')] / byLevel[('num_attempts', 'sum')]
# User_avg_attemp# differs from the 'mean' of num_attempts: the former divides
# by distinct players, the latter averages over the grouped rows.
# Author's observation: User_success_rate is highly correlated with User_avg_attemp#.
import matplotlib.pyplot as plt
byLevel[['User_avg_attemp#', 'User_success_rate']].plot(
    x='User_avg_attemp#',
    y='User_success_rate',
    kind="scatter",
)
plt.show()

# 歪个话题，记几个添加列的方法
# =============================================================================
# 1. 直接命名并指定value
# from datetime import datetime
# candy_crush['create_time'] = datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")
# candy_crush.drop(['create_time'], axis = 1, inplace = True) # axis = 0 是删除行，设置inplace = True 对原数据做修改
# 2. insert 方法， 第一个参数指定插入列的位置，第二个指定列名，第三个指定插入的值
# candy_crush.insert(0,'ID', range(candy_crush.shape[0]))
# 3. 直接赋值
# candy_crush['new_columns'] = value
# 4. reindex 并指定fill_value; 不是常规用法，需要列出所有的列名（包括新增列名），并且fill_value会把原有列中的缺失值都替换掉
# candy_crush.reindex(columns = [], fill_value = )
# 5. concat 方法，用于横向表拼接，参见"Merge/concat/join tables"
# 6. iloc/loc 方法
# =============================================================================


# =============================================================================
# pivot_table and melt functions
# =============================================================================
# A pivot table needs an index.
# This pivot is purely a demonstration; it serves no analytical purpose.
pivot_candy = pd.pivot_table(
    candy_crush,
    index=['dt'],
    columns=['level'],
    values=['num_attempts', 'num_success'],
    aggfunc='sum',
    fill_value=0,
)
# The pivoted columns form a MultiIndex; droplevel can remove a level:
# pivot_candy = pivot_candy.droplevel(None, axis = 1)

# Unpivoting: pd.melt(frame, ...) or frame.melt(id_vars, value_vars, var_name, value_name)
#   id_vars    : columns to keep as identifiers (not unpivoted)
#   value_vars : columns to unpivot; omit to unpivot all remaining columns
#   var_name / value_name : names for the resulting variable and value columns
#   col_level  : level to melt when the columns are a MultiIndex
unpivot_candy = pd.melt(
    pivot_candy,
    var_name=['Statue', 'level'],
    value_name='number of times',
)


# =============================================================================
# Pandas: Data manipulation
# =============================================================================
# Two small tables whose join keys have different names on each side.
left = pd.DataFrame(dict(
    key1=["K0", "K0", "K1", "K2"],
    key2=["K0", "K1", "K0", "K1"],
    A=["A0", "A1", "A2", "A3"],
    B=["B0", "B1", "B2", "B3"],
))

right = pd.DataFrame(dict(
    key3=["K0", "K1", "K1", "K2"],
    key4=["K0", "K0", "K0", "K0"],
    C=["C0", "C1", "C2", "C3"],
    D=["D0", "D1", "D2", "D3"],
))

# Merge / concat / join
# 1. merge: comparable to a SQL join. pd.merge(t1, t2) and t1.merge(t2) are
#    interchangeable. Use on= when the key has the same name in both tables,
#    left_on=/right_on= when the names differ; with none given, the columns
#    sharing a name are used. suffixes= is appended to non-key columns whose
#    names clash.
table = pd.merge(
    left,
    right,
    how='left',
    left_on=["key1", "key2"],
    right_on=["key3", "key4"],
    suffixes=("_left", "_right"),
)
# 2. pd.concat([t1, t2, ...], axis=0/1, keys=None, join="outer", ...)
#    Stacks tables vertically (axis=0, the default) or side by side (axis=1).
#    Columns missing from one table are filled with NaN; join may be
#    "outer" (default) or "inner".
pd.concat([left, right], axis=0)
pd.concat([left, right], axis=1)  # side by side; concat takes no join key
# 3. DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False)
#    Parameters mirror merge; the default is a left join on the row index.
#    Set lsuffix/rsuffix when the tables share column names. To join on
#    columns, index the other table by its key first:
#    df1.join(df2.set_index(key_of_df2), on=key_of_df1); df2's key columns
#    are then not kept in the result.
left.join(right.set_index(["key3", "key4"]), on=["key1", "key2"])

# Count the non-NA cells of every column.
# None, NaN and NaT are treated as NA.
table.count()
# Count the NA cells of every column; isna() and isnull() are aliases.
table.isnull().sum()

# Finding missing data: NaN, None, NaT
# NaN : "Not a Number"; exposed by numpy as np.nan (a float value).
# None: Python's null object; distinct from an empty list or empty string.
# NaT : "Not a Time"; marks an unknown or missing value in datetime data.
import numpy as np
# np.nan is the supported spelling: the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
type(np.nan)   # float
type(None)     # NoneType
type(pd.NaT)   # pandas' NaT type

# =============================================================================
# 其他用法
# =============================================================================
# Inspect the column data types: info() prints a summary that includes them ...
candy_crush.info()
# ... and the dtypes attribute holds them as a Series.
candy_crush.dtypes

# delete row/column
# candy_crush.drop([index/columns list])

# sort rows
# if you want to change the order of the rows. You can sort the rows by 
# passing a column name to .sort_values()
# candy_crush.sort_values('', inplace = True)

# drop duplicates
# pd.DataFrame.drop_duplicates()

# =============================================================================
# Write DataFrame into CSV file
# =============================================================================
# candy_crush.to_csv(dir)