Data Processing
A record of the code snippets used during the data-processing stage, plus extensions.
Mainly based on the pandas library.
1. File Paths
(1) The os module
import os

os.getcwd()                                          # get the current working directory
os.path.dirname(os.getcwd())                         # get the parent directory
os.path.join(os.path.dirname(os.getcwd()), 'test')   # build a sibling path
# First take the parent directory, then join the sibling directory 'test' onto it.
os.listdir()                                         # list all file names in the current directory

path = 'F:/pics'
os.path.dirname(path)                        # output: 'F:/'
os.listdir(path)                             # output: ['abc.jpg']
os.path.join(os.path.dirname(path), 'test')  # output: 'F:/test'
(2) Relative paths
# /   : the root of the current path.
# ./  : the current path.
# ../ : the parent path, one level above the current path.
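As a small sketch of how these prefixes are used in practice (the file names below are hypothetical), relative paths can be passed directly to the pandas readers:

import pandas as pd

df_here   = pd.read_csv('./data/records.csv')    # a 'data' folder inside the current directory
df_parent = pd.read_csv('../data/records.csv')   # a 'data' folder one level above the current directory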
2. File Reading
(1) Reading or writing a single file
import pandas as pd

df1 = pd.read_csv("path/filename.csv", header=0, index_col=0, usecols=[1,2,3])    # specify the header row, index column, and columns to use
df2 = pd.read_excel("path/filename.xlsx", sheet_name=0, header=0, index_col=0)    # specify which sheet to read (.xls also works)
df1.to_csv("path/filename.csv")
df2.to_excel("path/filename.xls")
(2) A framework for batch file processing:
One level of nesting
import os
import pandas as pd

read_path = 'input directory'
output_path = 'output directory'

def read_files(read_path, output_path):
    files = os.listdir(read_path)    # list all file names under read_path in order
    for file in files:
        # read a single file
        df = pd.read_csv(os.path.join(read_path, file), header=0, index_col=0)
        # process the file
        # save the result to the output directory
        df.to_csv(os.path.join(output_path, file))
    print("All files processed")
Two levels of nesting
# Batch-read the files and extract the data
def deal_files():
    # list all folder names under read_path in order
    files = os.listdir(read_path)
    # print(files)
    for file_name in files:
        # print(file_name)
        print("Processing: " + file_name)
        # list the files inside this folder
        file = os.listdir(os.path.join(read_path, file_name))
        # print(file)
        # create an output directory for this folder
        path = os.path.join(output_path, file_name)
        os.mkdir(path)
        for wj in file:
            # print(wj)
            df = pd.read_csv(os.path.join(read_path, file_name, wj), header=0, index_col=0)
            # data processing
            ...
            # save the result to the output directory
            df.to_csv(os.path.join(path, wj))
        print("ok")
    print("All files processed")
(3) Writing two DataFrames into one Excel file
with pd.ExcelWriter('sheet.xlsx') as writer:
    df1.to_excel(writer, sheet_name='sheet1')
    df2.to_excel(writer, sheet_name='sheet2')
3. DataFrame
# pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=None)
(1) Creation
① From a dict
d = {'col1': ['A','B'], 'col2': [1,2]}
pd.DataFrame(data=d)
# Output:
  col1  col2
0    A     1
1    B     2

pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
              'B': {0: 1, 1: 3, 2: 5},
              'C': {0: 2, 1: 4, 2: 6}})
# Output:
   A  B  C
0  a  1  2
1  b  3  4
2  c  5  6

data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
pd.DataFrame.from_dict(data)    # by default the keys become column names
# Output:
   col_1 col_2
0      3     a
1      2     b
2      1     c
3      0     d

data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
pd.DataFrame.from_dict(data, orient='index')    # the keys become the index
# Output:
       0  1  2  3
row_1  3  2  1  0
row_2  a  b  c  d

pd.DataFrame.from_dict(data, orient='index', columns=['A', 'B', 'C', 'D'])
# Output:
       A  B  C  D
row_1  3  2  1  0
row_2  a  b  c  d
② From a dict containing Series
d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
pd.DataFrame(data=d, index=[0, 1, 2, 3])
# Output:
   col1  col2
0     0   NaN
1     1   NaN
2     2   2.0
3     3   3.0
③ From a numpy array
import numpy as np

pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=['a', 'b', 'c'])
# Output:
   a  b  c
0  1  2  3
1  4  5  6
2  7  8  9
④ Direct creation from nested lists
pd.DataFrame([['a1','a2'], ['b1','b2']], columns=['A','B'])
# Output:
    A   B
0  a1  a2
1  b1  b2
(2) Data selection
df.loc is the most convenient.
1) Row or column selection (single dimension): df[] selects either rows or columns, not both.
df[ ] takes a slice for one or more rows (DataFrame), or a single column label (Series).
df[[ ]] takes one or more column labels (DataFrame); slicing is not allowed.
2) Region selection (multi-dimensional): df.loc[], df.iloc[]
df.loc[ ] with a single label returns one row or one column as a Series.
df.loc[:, ] selects columns, by slice or by list.
df.loc[ ,:] selects rows, by slice or by list.
df.loc[[ ]] selects one or more rows as a DataFrame; a list of index labels cannot be sliced.
Both can be used to locate values and to assign to them.
3) Cell selection (single point): df.at[], df.iat[]
Integer-position slices are half-open (end excluded); label slices are closed (end included).
A single pair of brackets returning one item gives a Series; double brackets (a list inside) give a DataFrame.
① df[]  # select columns/rows
df = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=['a', 'b', 'c', 'd'],
)

df['A']        # Series, one column
# Output:
a    A0
b    A1
c    A2
d    A3
Name: A, dtype: object

df.A           # same as df['A']
# Output:
a    A0
b    A1
c    A2
d    A3
Name: A, dtype: object

df[['A']]      # DataFrame, one column
# Output:
    A
a  A0
b  A1
c  A2
d  A3

df[['A','C']]  # DataFrame, multiple columns
# Output:
    A   C
a  A0  C0
b  A1  C1
c  A2  C2
d  A3  C3

df[0:2]        # rows by position, end excluded
# Output:
    A   B   C   D
a  A0  B0  C0  D0
b  A1  B1  C1  D1

df['a':'c']    # rows by label, end included; not applicable when the index is int
# Output:
    A   B   C   D
a  A0  B0  C0  D0
b  A1  B1  C1  D1
c  A2  B2  C2  D2
② pandas.DataFrame.loc  # by label
df = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)
# df:
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3

df.loc[0]          # Series, a single row
# Output:
A    A0
B    B0
C    C0
D    D0
Name: 0, dtype: object

df.loc[[0]]        # DataFrame, a single row
# Output:
    A   B   C   D
0  A0  B0  C0  D0

df.loc[[0,2]]      # DataFrame, multiple rows
# Output:
    A   B   C   D
0  A0  B0  C0  D0
2  A2  B2  C2  D2

df.loc[0,'A']      # a single value
# Output: 'A0'

df.loc[0,'A':'C']  # one row, sliced columns
# Output:
A    A0
B    B0
C    C0
Name: 0, dtype: object

df.loc[:,'A':'B']  # sliced columns
# Output:
    A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3

df.loc[df['A'] == 'A0']   # boolean filtering; > and < also work
# Output:
    A   B   C   D
0  A0  B0  C0  D0

df.loc[[1],['A']] = 'a'   # assignment
# Output:
    A   B   C   D
0  A0  B0  C0  D0
1   a  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3

df.loc[1,'B'] = 'B'       # assignment
# Output:
    A   B   C   D
0  A0  B0  C0  D0
1   a   B  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3
③ pandas.DataFrame.iloc  # by integer position
# Usage mirrors pandas.DataFrame.loc
df = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)

df.iloc[3,2]   # a single value
# Output: 'C3'
④ pandas.DataFrame.at and pandas.DataFrame.iat: access a single value
df = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)

df.at[0,'A']
df.iat[0,0]
# Output:
'A0'
'A0'
⑤ DataFrame.isin(values): select rows/columns containing specific values
'''
DataFrame.isin(values)
values: iterable, Series, DataFrame or dict
'''
df = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)
# Output:
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3

df[df['A'].isin(['A0'])]
# Output:
    A   B   C   D
0  A0  B0  C0  D0

df[df['A'].isin(['A0','A3'])]
# Output:
    A   B   C   D
0  A0  B0  C0  D0
3  A3  B3  C3  D3
When the positions or names of the target rows/columns are already known, put them in a list, read them in a for loop, then merge the results or save them to a new file (see the sketch below).
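A minimal sketch of that workflow, assuming the directory and column names below are placeholders: collect the wanted column names in a list, pull them out of each file in a loop, then concatenate the pieces and save the result.

import os
import pandas as pd

wanted_cols = ['A', 'C']          # hypothetical column names known in advance
read_path = 'input directory'

parts = []
for file in os.listdir(read_path):
    df = pd.read_csv(os.path.join(read_path, file), header=0, index_col=0)
    parts.append(df[wanted_cols])         # keep only the wanted columns

pd.concat(parts).to_csv('merged.csv')     # merge everything and save to a new file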
⑥ DataFrame.filter(): subset by index labels
'''
DataFrame.filter(items=None, like=None, regex=None, axis=None)
'''
df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
                  index=['mouse', 'rabbit'],
                  columns=['one', 'two', 'three'])
# df:
        one  two  three
mouse     1    2      3
rabbit    4    5      6

df.filter(items=['one', 'three'])   # select columns by name
# Output:
        one  three
mouse     1      3
rabbit    4      6

df.filter(regex='e$', axis=1)       # select columns by regular expression
# Output:
        one  three
mouse     1      3
rabbit    4      6

df.filter(like='bbi', axis=0)       # select rows containing 'bbi'
# Output:
        one  two  three
rabbit    4    5      6
⑦ DataFrame.get(): select columns
'''
DataFrame.get(key, default=None)
Get item from object for given key (ex: DataFrame column).
Returns default value if not found.
'''
df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
                  index=['mouse', 'rabbit'],
                  columns=['one', 'two', 'three'])

df.get(['one','two'])
# Output:
        one  two
mouse     1    2
rabbit    4    5

df.get('one')
# Output:
mouse     1
rabbit    4
Name: one, dtype: int32

df.get(['four'], "not found")
# Output: 'not found'
⑧ DataFrame.select_dtypes(): subset columns by dtype
'''
DataFrame.select_dtypes(include=None, exclude=None)
To select all numeric types, use np.number or 'number'
To select strings you must use the object dtype, but note that this will return all object dtype columns
To select datetimes, use np.datetime64, 'datetime' or 'datetime64'
To select timedeltas, use np.timedelta64, 'timedelta' or 'timedelta64'
To select Pandas categorical dtypes, use 'category'
To select Pandas datetimetz dtypes, use 'datetimetz' (new in 0.20.0) or 'datetime64[ns, tz]'
numpy dtype hierarchy: https://numpy.org/doc/stable/reference/arrays.scalars.html
'''
df = pd.DataFrame({'a': [1, 2] * 3,
                   'b': [True, False] * 3,
                   'c': [1.0, 2.0] * 3})
df
# Output:
   a      b    c
0  1   True  1.0
1  2  False  2.0
2  1   True  1.0
3  2  False  2.0
4  1   True  1.0
5  2  False  2.0

df.dtypes
# Output:
a      int64
b       bool
c    float64
dtype: object

df.select_dtypes(include='bool')
# Output:
       b
0   True
1  False
2   True
3  False
4   True
5  False

df.select_dtypes(exclude='bool')
# Output:
   a    c
0  1  1.0
1  2  2.0
2  1  1.0
3  2  2.0
4  1  1.0
5  2  2.0
(3) Merging DataFrames
① pandas.DataFrame.join: column-wise merge
Without a meaningful index:
'''
DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False)
Join columns of another DataFrame, either on the index or on a key column.
'''
df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                    'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
                    'B': ['B0', 'B1', 'B2']})

df1.join(df2)                              # raises an error (overlapping column names)

df1.join(df2, lsuffix='_1', rsuffix='_2')  # add suffixes to the overlapping columns
# Output:
  key_1   A key_2    B
0    K0  A0    K0   B0
1    K1  A1    K1   B1
2    K2  A2    K2   B2
3    K3  A3   NaN  NaN
4    K4  A4   NaN  NaN
5    K5  A5   NaN  NaN

df1.set_index('key').join(df2.set_index('key'))   # join on the key column
# Output:
      A    B
key
K0   A0   B0
K1   A1   B1
K2   A2   B2
K3   A3  NaN
K4   A4  NaN
K5   A5  NaN

df1.join(df2.set_index('key'), on='key')   # merge on the shared column
# Output:
  key   A    B
0  K0  A0   B0
1  K1  A1   B1
2  K2  A2   B2
3  K3  A3  NaN
4  K4  A4  NaN
5  K5  A5  NaN

print(df2.set_index('key'))   # set a column as the index
# Output:
      B
key
K0   B0
K1   B1
K2   B2

df1 = pd.DataFrame({'字母': ['A','B'], '数字': [1,2]})
df2 = pd.DataFrame({'英文': ['one','two']})
df1.join(df2)
# Output:
  字母  数字   英文
0    A     1  one
1    B     2  two
With an index:
df1 = pd.DataFrame({'字母': ['A','B'], '数字': [1,2]})
df2 = pd.DataFrame({'英文': ['one','two']}, index=['a','b'])
df1.join(df2, how='right')   # which DataFrame's index to keep
# Output:
   字母  数字   英文
a  NaN   NaN  one
b  NaN   NaN  two

left = pd.DataFrame(
    {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=["K0", "K1", "K2"]
)
right = pd.DataFrame(
    {"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]}, index=["K0", "K2", "K3"]
)

left.join(right)
# Output:
     A   B    C    D
K0  A0  B0   C0   D0
K1  A1  B1  NaN  NaN
K2  A2  B2   C2   D2

right.join(left)
# Output:
     C   D    A    B
K0  C0  D0   A0   B0
K2  C2  D2   A2   B2
K3  C3  D3  NaN  NaN

left.join(right, how="inner")
# Output:
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C2  D2

left.join(right, how="outer")
# Output:
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
② pandas.DataFrame.merge: column-wise merge
'''
DataFrame.merge(right, how='inner', on=None, left_on=None, right_on=None, left_index=False,
                right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)

how: {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'   # type of merge
on: column or index level names to join on; must exist in both DataFrames, defaults to the shared columns
left_on/right_on: key column names in the left/right DataFrame
suffixes: suffixes for overlapping column names
'''
df1 = pd.DataFrame({'akey': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                    'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
df2 = pd.DataFrame({'bkey': ['K0', 'K1', 'K2'],
                    'B': ['B0', 'B1', 'B2']})

df1.merge(df2)    # raises an error (no common columns)

df1.merge(df2, left_on='akey', right_on='bkey')
# Output:
  akey   A bkey   B
0   K0  A0   K0  B0
1   K1  A1   K1  B1
2   K2  A2   K2  B2

df1 = pd.DataFrame({'akey': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
                    'v': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
df2 = pd.DataFrame({'bkey': ['K0', 'K1', 'K2'],
                    'v': ['B0', 'B1', 'B2']})

df1.merge(df2, left_on='akey', right_on='bkey')   # overlapping columns get the default suffixes _x, _y
# Output:
  akey v_x bkey v_y
0   K0  A0   K0  B0
1   K1  A1   K1  B1
2   K2  A2   K2  B2

df1.merge(df2, left_on='akey', right_on='bkey', suffixes=('_left', '_right'))   # custom suffixes
# Output:
  akey v_left bkey v_right
0   K0     A0   K0      B0
1   K1     A1   K1      B1
2   K2     A2   K2      B2

df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})

df1.merge(df2, how='inner')   # keep only rows whose keys appear in both
# Output:
     a  b  c
0  foo  1  3

df1.merge(df2, how='outer')   # keep everything, fill the gaps with NaN
# Output:
     a    b    c
0  foo  1.0  3.0
1  bar  2.0  NaN
2  baz  NaN  4.0

df1.merge(df2, how='left')    # column b from the left is complete, missing c becomes NaN
# Output:
     a  b    c
0  foo  1  3.0
1  bar  2  NaN

df1.merge(df2, how='right')   # column c from the right is complete, missing b becomes NaN
# Output:
     a    b  c
0  foo  1.0  3
1  baz  NaN  4

df1.merge(df2, how='cross')   # cartesian product
# Output:
   a_x  b  a_y  c
0  foo  1  foo  3
1  foo  1  baz  4
2  bar  2  foo  3
3  bar  2  baz  4

df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2], 'c': [3, 4]})
df2 = pd.DataFrame({'a': ['foo', 'baz'], 'b': [3, 4], 'c': [3, 4]})

df1.merge(df2, how='outer', on='a')
# Output:
     a  b_x  c_x  b_y  c_y
0  foo  1.0  3.0  3.0  3.0
1  bar  2.0  4.0  NaN  NaN
2  baz  NaN  NaN  4.0  4.0

df1.merge(df2, how='outer', on='b')
# Output:
   a_x  b  c_x  a_y  c_y
0  foo  1  3.0  NaN  NaN
1  bar  2  4.0  NaN  NaN
2  NaN  3  NaN  foo  3.0
3  NaN  4  NaN  baz  4.0
Reference: usage of pandas.DataFrame.merge
③ pandas.DataFrame.append  # row-wise concatenation (deprecated in newer pandas; pd.concat is the replacement)
'''
DataFrame.append(other, ignore_index=False, verify_integrity=False, sort=False)
'''
df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2], 'c': [3, 4]})
df2 = pd.DataFrame({'a': ['foo', 'baz'], 'b': [3, 4], 'c': [3, 4]})

df1.append(df2)
# Output:
     a  b  c
0  foo  1  3
1  bar  2  4
0  foo  3  3
1  baz  4  4

df1.append(df2, ignore_index=True)
# Output:
     a  b  c
0  foo  1  3
1  bar  2  4
2  foo  3  3
3  baz  4  4

# Create a new column and fill it row by row
df = pd.DataFrame(columns=['A'])
for i in range(5):
    df = df.append({'A': i}, ignore_index=True)
# Output:
   A
0  0
1  1
2  2
3  3
4  4
Reference: merge, join and concat
④ pandas.merge: column-wise merge
See the reference link for details.
'''
pandas.merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False,
             right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
'''
left = pd.DataFrame(
    {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=["K0", "K1", "K2"]
)
right = pd.DataFrame(
    {"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]}, index=["K0", "K2", "K3"]
)

pd.merge(left, right, left_index=True, right_index=True, how="outer")
# Output:
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
# Same output as left.join(right, how="outer")

pd.merge(left, right, left_index=True, right_index=True, how="inner")
# Output:
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C2  D2
# Same output as left.join(right, how="inner")
⑤ pandas.concat: works on both rows and columns
'''
pandas.concat(objs, axis=0, join='outer', ignore_index=False, keys=None, levels=None,
              names=None, verify_integrity=False, sort=False, copy=True)

pandas.concat(objs,                     # objects to concatenate
              axis=0,                   # axis to concatenate along, 0 (rows) by default
              join='outer',             # 'inner' keeps the intersection, 'outer' the union
              ignore_index=False,       # whether to rebuild the index after concatenation
              keys=None,                # labels for the source objects along the row index; builds a hierarchical index, can be any list, array, or tuple of labels
              levels=None,              # index values to use for each level of the hierarchical index when keys is set
              names=None,               # names for the levels of the resulting row index, as a list
              verify_integrity=False,   # check for duplicate index values; raise if any are found
              sort=False,               # sort the non-concatenation axis
              copy=True)                # whether to deep-copy the data
'''
df1 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)
df2 = pd.DataFrame(
    {
        "A": ["A4", "A5", "A6", "A7"],
        "B": ["B4", "B5", "B6", "B7"],
        "C": ["C4", "C5", "C6", "C7"],
        "D": ["D4", "D5", "D6", "D7"],
    },
    index=[4, 5, 6, 7],
)
df3 = pd.DataFrame(
    {
        "A": ["A8", "A9", "A10", "A11"],
        "B": ["B8", "B9", "B10", "B11"],
        "C": ["C8", "C9", "C10", "C11"],
        "D": ["D8", "D9", "D10", "D11"],
    },
    index=[8, 9, 10, 11],
)

pd.concat([df1, df2, df3])
# Output:
      A    B    C    D
0    A0   B0   C0   D0
1    A1   B1   C1   D1
2    A2   B2   C2   D2
3    A3   B3   C3   D3
4    A4   B4   C4   D4
5    A5   B5   C5   D5
6    A6   B6   C6   D6
7    A7   B7   C7   D7
8    A8   B8   C8   D8
9    A9   B9   C9   D9
10  A10  B10  C10  D10
11  A11  B11  C11  D11

result = pd.concat([df1, df2, df3], keys=["x", "y", "z"])
# Output:
        A    B    C    D
x 0    A0   B0   C0   D0
  1    A1   B1   C1   D1
  2    A2   B2   C2   D2
  3    A3   B3   C3   D3
y 4    A4   B4   C4   D4
  5    A5   B5   C5   D5
  6    A6   B6   C6   D6
  7    A7   B7   C7   D7
z 8    A8   B8   C8   D8
  9    A9   B9   C9   D9
  10  A10  B10  C10  D10
  11  A11  B11  C11  D11

result.loc["y"]
# Output:
    A   B   C   D
4  A4  B4  C4  D4
5  A5  B5  C5  D5
6  A6  B6  C6  D6
7  A7  B7  C7  D7

df1 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    },
    index=[0, 1, 2, 3],
)
df2 = pd.DataFrame(
    {
        "B": ["B2", "B3", "B6", "B7"],
        "D": ["D2", "D3", "D6", "D7"],
        "F": ["F2", "F3", "F6", "F7"],
    },
    index=[2, 3, 6, 7],
)
pd.concat([df1, df2], axis=1).reindex(df1.index)   # keep df1's index
# Output:
    A   B   C   D    B    D    F
0  A0  B0  C0  D0  NaN  NaN  NaN
1  A1  B1  C1  D1  NaN  NaN  NaN
2  A2  B2  C2  D2   B2   D2   F2
3  A3  B3  C3  D3   B3   D3   F3

df1 = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
    }
)
df2 = pd.DataFrame(
    {
        "A": ["A4", "A5", "A6", "A7"],
        "B": ["B4", "B5", "B6", "B7"],
        "C": ["C4", "C5", "C6", "C7"],
        "D": ["D4", "D5", "D6", "D7"],
    }
)
df3 = pd.DataFrame(
    {
        "A": ["A8", "A9", "A10", "A11"],
        "B": ["B8", "B9", "B10", "B11"],
        "C": ["C8", "C9", "C10", "C11"],
        "D": ["D8", "D9", "D10", "D11"],
    }
)

pd.concat([df1, df2, df3], ignore_index=True)
# Output:
      A    B    C    D
0    A0   B0   C0   D0
1    A1   B1   C1   D1
2    A2   B2   C2   D2
3    A3   B3   C3   D3
4    A4   B4   C4   D4
5    A5   B5   C5   D5
6    A6   B6   C6   D6
7    A7   B7   C7   D7
8    A8   B8   C8   D8
9    A9   B9   C9   D9
10  A10  B10  C10  D10
11  A11  B11  C11  D11

pd.concat([df1, df2, df3], axis=1)
# Output:
    A   B   C   D   A   B   C   D    A    B    C    D
0  A0  B0  C0  D0  A4  B4  C4  D4   A8   B8   C8   D8
1  A1  B1  C1  D1  A5  B5  C5  D5   A9   B9   C9   D9
2  A2  B2  C2  D2  A6  B6  C6  D6  A10  B10  C10  D10
3  A3  B3  C3  D3  A7  B7  C7  D7  A11  B11  C11  D11
(4) Inserting a column
'''
DataFrame.insert(loc, column, value, allow_duplicates=False)
Insert a column at a specific position.
'''
df = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
        "E": ["A0", "A1", "A2", "A3"]
    }
)
df.insert(0, 'new_A', [1, 0, 0, 100])
# df:
   new_A   A   B   C   D   E
0      1  A0  B0  C0  D0  A0
1      0  A1  B1  C1  D1  A1
2      0  A2  B2  C2  D2  A2
3    100  A3  B3  C3  D3  A3

df.insert(3, 'A', [1, 0, 0, 100], allow_duplicates=True)
# df:
   new_A   A   B    A   C   D   E
0      1  A0  B0    1  C0  D0  A0
1      0  A1  B1    0  C1  D1  A1
2      0  A2  B2    0  C2  D2  A2
3    100  A3  B3  100  C3  D3  A3
(5) Splitting a column
df = pd.DataFrame([['2015-02-04 04:05:00'], ['2015-02-04 04:05:05'], ['2015-02-04 04:05:10']], columns=['time'])
# df:
                  time
0  2015-02-04 04:05:00
1  2015-02-04 04:05:05
2  2015-02-04 04:05:10

t = df['time'].str.split(' ', expand=True)
# t:
            0         1
0  2015-02-04  04:05:00
1  2015-02-04  04:05:05
2  2015-02-04  04:05:10
Useful for splitting timestamps or values that carry units (see the sketch below).
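A small sketch of the unit case (the column name and values are made up): split a value-plus-unit string into two new columns.

df = pd.DataFrame({'weight': ['12 kg', '7 kg', '30 kg']})
parts = df['weight'].str.split(' ', expand=True)
df['value'] = parts[0].astype(float)    # numeric part
df['unit'] = parts[1]                   # unit part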
(6) Deleting rows/columns
'''
DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')
DataFrame.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)   # drop duplicate rows
DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)              # drop missing values
'''
① DataFrame.drop()
import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['A', 'B', 'C', 'D'])
# df:
   A  B   C   D
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11

# Drop columns
df.drop(['B','C'], axis=1)
# Output:
   A   D
0  0   3
1  4   7
2  8  11

df.drop(columns=['B','C'])
# Output:
   A   D
0  0   3
1  4   7
2  8  11

# Drop rows
df.drop([1,2], axis=0)
# Output:
   A  B  C  D
0  0  1  2  3

# Drop rows and columns
df.drop(index=[2], columns=['B','C'])
# Output:
   A  D
0  0  3
1  4  7
② DataFrame.drop_duplicates(): only removes duplicate rows; use a transpose to remove duplicate columns
'''
DataFrame.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)

Parameters
subset: column label or sequence of labels, optional
    Columns used to identify duplicates.
keep: {'first', 'last', False}, default 'first'
    Which duplicate to keep.
    - first : Drop duplicates except for the first occurrence.
    - last : Drop duplicates except for the last occurrence.
    - False : Drop all duplicates.
inplace: bool, default False
    Whether to drop duplicates in place or to return a copy.
ignore_index: bool, default False
    Relabel the resulting index 0, 1, ..., n-1.
'''
df = pd.DataFrame(
    [["A0", "A1", "A2", "A3"], ["B0", "B1", "B2", "B3"], ["C0", "C1", "C2", "C3"],
     ["D0", "D1", "D2", "D3"], ["A0", "A1", "A2", "A3"]],
    columns=[0, 1, 2, 3]
)
# df:
    0   1   2   3
0  A0  A1  A2  A3
1  B0  B1  B2  B3
2  C0  C1  C2  C3
3  D0  D1  D2  D3
4  A0  A1  A2  A3

df.drop_duplicates()
# Output:
    0   1   2   3
0  A0  A1  A2  A3
1  B0  B1  B2  B3
2  C0  C1  C2  C3
3  D0  D1  D2  D3

# Duplicate columns can be removed via a transpose
df = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
        "E": ["A0", "A1", "A2", "A3"]
    }
)
# df:
    A   B   C   D   E
0  A0  B0  C0  D0  A0
1  A1  B1  C1  D1  A1
2  A2  B2  C2  D2  A2
3  A3  B3  C3  D3  A3

df_new = df.T
# df_new:
    0   1   2   3
A  A0  A1  A2  A3
B  B0  B1  B2  B3
C  C0  C1  C2  C3
D  D0  D1  D2  D3
E  A0  A1  A2  A3

df1 = df_new.drop_duplicates()
# df1:
    0   1   2   3
A  A0  A1  A2  A3
B  B0  B1  B2  B3
C  C0  C1  C2  C3
D  D0  D1  D2  D3

df2 = df1.T
# df2:
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3
③ DataFrame.pop(): takes a column label and removes exactly one column
'''
DataFrame.pop(item)
item: label
    Label of the column to pop. Only one column can be removed at a time.
Returns: the removed column as a Series.
'''
df = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
        "E": ["A0", "A1", "A2", "A3"]
    }
)
df.pop('A')
# Output:
0    A0
1    A1
2    A2
3    A3
Name: A, dtype: object

df
# df:
    B   C   D   E
0  B0  C0  D0  A0
1  B1  C1  D1  A1
2  B2  C2  D2  A2
3  B3  C3  D3  A3
④ del: delete a column
df = pd.DataFrame(
    {
        "A": ["A0", "A1", "A2", "A3"],
        "B": ["B0", "B1", "B2", "B3"],
        "C": ["C0", "C1", "C2", "C3"],
        "D": ["D0", "D1", "D2", "D3"],
        "E": ["A0", "A1", "A2", "A3"]
    }
)
del df['A']
# df:
    B   C   D   E
0  B0  C0  D0  A0
1  B1  C1  D1  A1
2  B2  C2  D2  A2
3  B3  C3  D3  A3
⑤ isin(): select or drop the rows containing a specific value
df[df['有效'].isin(['否'])]     # rows where column '有效' equals '否'
df[-df['有效'].isin(['否'])]    # rows where column '有效' is not '否' (negation; ~ is the usual operator)
(7) Handling missing values
① DataFrame.isna() and DataFrame.isnull(): detect missing values
NA values, such as None or numpy.NaN
# ① DataFrame.isna()  # detect missing values
# NA values, such as None or numpy.NaN
df = pd.DataFrame(dict(age=[5, 6, np.NaN],
                       born=[pd.NaT, pd.Timestamp('1939-05-27'), pd.Timestamp('1940-04-25')],
                       name=['Alfred', 'Batman', ''],
                       toy=[None, 'Batmobile', 'Joker']))
df
# df:
   age       born    name        toy
0  5.0        NaT  Alfred       None
1  6.0 1939-05-27  Batman  Batmobile
2  NaN 1940-04-25              Joker

df.isna()
# Output:
     age   born   name    toy
0  False   True  False   True
1  False  False  False  False
2   True  False  False  False

# DataFrame.isnull() is an alias of .isna()
# Detecting non-missing values:
DataFrame.notna()
DataFrame.notnull()

df[df['XX'].isnull()]      # rows where column 'XX' is null
df[~df['XX'].isnull()]     # rows where column 'XX' is not null
df[df['XX'].notnull()]     # rows that are not null
df.dropna(subset=['XX'])   # drop the rows where 'XX' is null
Filtering data that involves null values.
② DataFrame.dropna(): drop missing values
# ② DataFrame.dropna()  # drop missing values
'''
DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

axis: {0 or 'index', 1 or 'columns'}, default 0
    0, or 'index' : Drop rows which contain missing values.
    1, or 'columns' : Drop columns which contain missing value.
how: {'any', 'all'}, default 'any'
    Determine if row or column is removed from DataFrame, when we have at least one NA or all NA.
    'any' : If any NA values are present, drop that row or column.
    'all' : If all values are NA, drop that row or column.
thresh: int, optional
    Require that many non-NA values.
subset: column label or sequence of labels, optional
    Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include.
inplace: bool, default False
    If True, do operation inplace and return None.
'''
df.dropna(axis=0, how='any', subset=None, inplace=True)
③ DataFrame.fillna(): fill missing values
'''
DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)

method: {'backfill', 'bfill', 'pad', 'ffill', None}, default None
    pad / ffill: propagate last valid observation forward to next valid
    backfill / bfill: use next valid observation to fill gap.
'''
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                   [3, 4, np.nan, 1],
                   [np.nan, np.nan, np.nan, np.nan],
                   [np.nan, 3, np.nan, 4]],
                  columns=list("ABCD"))
df
# Output:
     A    B   C    D
0  NaN  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  NaN  NaN NaN  NaN
3  NaN  3.0 NaN  4.0

df.fillna(0)
# Output:
     A    B    C    D
0  0.0  2.0  0.0  0.0
1  3.0  4.0  0.0  1.0
2  0.0  0.0  0.0  0.0
3  0.0  3.0  0.0  4.0

df.fillna(method="ffill")   # fill with the previous value
# Output:
     A    B   C    D
0  NaN  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  3.0  4.0 NaN  1.0
3  3.0  3.0 NaN  4.0

df.fillna(method="bfill")   # fill with the next value
# Output:
     A    B   C    D
0  3.0  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  NaN  3.0 NaN  4.0
3  NaN  3.0 NaN  4.0

values = {"A": 0, "B": 1, "C": 2, "D": 3}
df.fillna(value=values)     # fill with the given per-column values
# Output:
     A    B    C    D
0  0.0  2.0  2.0  0.0
1  3.0  4.0  2.0  1.0
2  0.0  1.0  2.0  3.0
3  0.0  3.0  2.0  4.0

df.fillna(value=values, limit=1)   # fill only the first NaN in each column
# Output:
     A    B    C    D
0  0.0  2.0  2.0  0.0
1  3.0  4.0  NaN  1.0
2  NaN  1.0  NaN  3.0
3  NaN  3.0  NaN  4.0

df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
# df2:
     A    B    C    E
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0

df.fillna(df2)   # fill from another DataFrame, matched on index and columns
④ Testing for nan
import numpy as np

type(np.nan)         # float
a = np.nan
print(a == np.nan)   # False
np.isnan(a)          # True

# If the nan is of type <class 'numpy.float64'>,
# math.isnan() can also be used.

df.at[i, 'Amap_loc'] = 'XX'      # a string value
np.isnan(df.at[i, 'Amap_loc'])
# Raises:
# TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not
# be safely coerced to any supported types according to the casting rule ''safe''
# Fix:
pd.isna(df.at[i, 'Amap_loc'])
(8) Data type conversion
'''
pandas.DataFrame.dtypes                                       # show the data types
pandas.DataFrame.select_dtypes(include=None, exclude=None)    # subset columns by dtype
pandas.to_numeric(arg, errors='raise', downcast=None)
pandas.to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, format=None,
                   exact=True, unit=None, infer_datetime_format=False, origin='unix', cache=True)
pandas.to_timedelta(arg, unit=None, errors='raise')
DataFrame.astype(dtype, copy=True, errors='raise')            # forced conversion
DataFrame.convert_dtypes(infer_objects=True, convert_string=True, convert_integer=True,
                         convert_boolean=True, convert_floating=True)
                         # convert columns to the best possible dtypes that support pd.NA
'''

# Datetime conversion
df = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]})
pd.to_datetime(df)
# Output:
0   2015-02-04
1   2016-03-05
dtype: datetime64[ns]

pd.to_datetime(1490195805, unit='s')
# Output: Timestamp('2017-03-22 15:16:45')

pd.to_datetime('2015-02-04 4:05')
# Output: Timestamp('2015-02-04 04:05:00')

df = pd.DataFrame([['2015.02.04 04:05:00'], ['2015-02-04 4:5:5'], ['2015-02-04 04:05:10']], columns=['time'])
# Times stored as strings; time differences cannot be computed directly
# df:
                  time
0  2015.02.04 04:05:00
1     2015-02-04 4:5:5
2  2015-02-04 04:05:10

df['time'] = pd.to_datetime(df['time'])   # unify the format; pass format="%Y-%m-%d %H:%M:%S" when all strings share that one layout
# df:
                 time
0 2015-02-04 04:05:00
1 2015-02-04 04:05:05
2 2015-02-04 04:05:10
(9) Sorting
'''
DataFrame.sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort',
                     na_position='last', sort_remaining=True, ignore_index=False, key=None)
                     # sort by the index/column labels
DataFrame.sort_values(by, axis=0, ascending=True, inplace=False, kind='quicksort',
                      na_position='last', ignore_index=False, key=None)
                      # sort the DataFrame by values
'''
① DataFrame.sort_index()  # by row/column labels
df = pd.DataFrame({
    'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
    'col4': [2, 1, 9, 8, 7, 4],
    'col3': [0, 1, 9, 4, 2, 3],
    'col2': ['a', 'B', 'c', 'D', 'e', 'F']},
    index=[3, 4, 5, 2, 6, 8])

df.sort_index()
# Output:
  col1  col4  col3 col2
2  NaN     8     4    D
3    A     2     0    a
4    A     1     1    B
5    B     9     9    c
6    D     7     2    e
8    C     4     3    F

df.sort_index(axis=1)
# Output:
  col1 col2  col3  col4
3    A    a     0     2
4    A    B     1     1
5    B    c     9     9
2  NaN    D     4     8
6    D    e     2     7
8    C    F     3     4
② DataFrame.sort_values()  # by the values of a column/row
df = pd.DataFrame({
    'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
    'col2': [2, 1, 9, 8, 7, 4],
    'col3': [0, 1, 9, 4, 2, 3],
    'col4': ['a', 'B', 'c', 'D', 'e', 'F']
})
# df:
  col1  col2  col3 col4
0    A     2     0    a
1    A     1     1    B
2    B     9     9    c
3  NaN     8     4    D
4    D     7     2    e
5    C     4     3    F

df.sort_values(by=['col1'])        # sort by one column
# Output:
  col1  col2  col3 col4
0    A     2     0    a
1    A     1     1    B
2    B     9     9    c
5    C     4     3    F
4    D     7     2    e
3  NaN     8     4    D

df.sort_values(by=['col1', 'col2'])   # sort by two columns
# Output:
  col1  col2  col3 col4
1    A     1     1    B
0    A     2     0    a
2    B     9     9    c
5    C     4     3    F
4    D     7     2    e
3  NaN     8     4    D

df.sort_values(by='col1', ascending=False)   # descending order
# Output:
  col1  col2  col3 col4
4    D     7     2    e
5    C     4     3    F
2    B     9     9    c
0    A     2     0    a
1    A     1     1    B
3  NaN     8     4    D

df.sort_values(by='col1', ascending=False, na_position='first')   # put NaN first
# Output:
  col1  col2  col3 col4
3  NaN     8     4    D
4    D     7     2    e
5    C     4     3    F
2    B     9     9    c
0    A     2     0    a
1    A     1     1    B
(10) Naming rows/columns and setting the index
'''
Labels:
DataFrame.index
DataFrame.columns

Renaming:
DataFrame.rename(mapper=None, *, index=None, columns=None, axis=None, copy=True, inplace=False, level=None, errors='ignore')
DataFrame.rename_axis(mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False)
DataFrame.set_axis(labels, axis=0, inplace=False)
    # Replace the row or column labels by assigning a list-like or Index.
DataFrame.set_index(keys, drop=True, append=False, inplace=False, verify_integrity=False)
    # Set the DataFrame index using existing columns.
DataFrame.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')
    # Reset the index.
'''
① DataFrame.index and DataFrame.columns  # assign directly
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df.index = [2, 3, 4]
# df:
   A  B
2  1  4
3  2  5
4  3  6

df.columns = ['a', 'b']
# df:
   a  b
2  1  4
3  2  5
4  3  6
② DataFrame.rename()  # rename specific rows/columns
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

df.rename(columns={"A": "a", "B": "c"})
# Output:
   a  c
0  1  4
1  2  5
2  3  6

df.rename(index={0: "x", 1: "y", 2: "z"})
# Output:
   A  B
x  1  4
y  2  5
z  3  6

df.rename(str.lower, axis='columns')
# Output:
   a  b
0  1  4
1  2  5
2  3  6

df.rename({1: 2, 2: 4}, axis='index')
# Output:
   A  B
0  1  4
2  2  5
4  3  6
③ DataFrame.rename_axis()  # rename the index/columns axis itself; most useful with multi-level headers
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

df.rename_axis('数字', axis='index')
# Output:
      A  B
数字
0     1  4
1     2  5
2     3  6

df.rename_axis('字母', axis='columns')
# Output:
字母  A  B
0     1  4
1     2  5
2     3  6

df = pd.DataFrame({"num_legs": [4, 4, 2], "num_arms": [0, 0, 2]},
                  ["dog", "cat", "monkey"])
# Output:
        num_legs  num_arms
dog            4         0
cat            4         0
monkey         2         2

df.index = pd.MultiIndex.from_product([['mammal'], ['dog', 'cat', 'monkey']],
                                      names=['type', 'ani_name'])
# df:
                 num_legs  num_arms
type   ani_name
mammal dog              4         0
       cat              4         0
       monkey           2         2

df.rename_axis(index={'type': 'class'})
# df:
                 num_legs  num_arms
class  ani_name
mammal dog              4         0
       cat              4         0
       monkey           2         2
④ DataFrame.set_axis(): replace the row or column labels
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

df.set_axis(['a', 'b', 'c'], axis='index')
   A  B
a  1  4
b  2  5
c  3  6

df.set_axis(['I', 'II'], axis='columns')
   I  II
0  1   4
1  2   5
2  3   6

df.set_axis(['i', 'ii'], axis='columns', inplace=True)
df
   i  ii
0  1   4
1  2   5
2  3   6
⑤ DataFrame.set_index(): set the index, multi-level supported
df = pd.DataFrame({'month': [1, 4, 7, 10],
                   'year': [2012, 2014, 2013, 2014],
                   'sale': [55, 40, 84, 31]})

df.set_index('month')
# Output:
       year  sale
month
1      2012    55
4      2014    40
7      2013    84
10     2014    31

df.set_index(['year', 'month'])
# Output:
            sale
year  month
2012  1       55
2014  4       40
2013  7       84
2014  10      31

df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
# Output:
         month  sale
  year
1 2012       1    55
2 2014       4    40
3 2013       7    84
4 2014      10    31
⑥ DataFrame.reset_index(): refresh the index
Useful for renumbering the index 0, 1, 2, … after some rows have been deleted.
'''
DataFrame.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')
'''
df = pd.DataFrame({
    'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
    'col4': [2, 1, 9, 8, 7, 4],
    'col3': [0, 1, 9, 4, 2, 3],
    'col2': ['a', 'B', 'c', 'D', 'e', 'F']},
    index=[3, 4, 5, 2, 6, 8])

df.reset_index(drop=True, inplace=False)
# Output:
  col1  col4  col3 col2
0    A     2     0    a
1    A     1     1    B
2    B     9     9    c
3  NaN     8     4    D
4    D     7     2    e
5    C     4     3    F

df1 = df.reset_index(drop=True, inplace=True)
# df is renumbered in place (same output as above), while df1 is None

df.reset_index(inplace=True)              # adds an extra 'index' column, but the new numbering is correct
df.reset_index(drop=True)                 # no extra column, but df itself keeps the old numbering (not in place)
df.reset_index(drop=True, inplace=True)   # the desired result
⑦ DataFrame.reindex(): reorder / fill / expand data
# DataFrame.reindex(): conform the Series/DataFrame to a new index, with optional filling logic
'''
DataFrame.reindex(labels=None, index=None, columns=None, axis=None, method=None, copy=True,
                  level=None, fill_value=nan, limit=None, tolerance=None)
a. reorder rows or columns
b. fill NaN values
c. expand the data to new labels
'''
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.reindex.html
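A brief illustrative sketch of the three uses listed above (the data is made up):

df = pd.DataFrame({'A': [1, 2, 3]}, index=['b', 'a', 'c'])

df.reindex(['a', 'b', 'c'])                       # a. reorder the rows
df.reindex(['a', 'b', 'c', 'd'])                  # c. expand: the new label 'd' is filled with NaN
df.reindex(['a', 'b', 'c', 'd'], fill_value=0)    # b. fill the missing entries with 0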
(11) Statistics
① Min/max, mean, mode, variance, etc.
pandas.DataFrame.min()
pandas.DataFrame.max()
pandas.DataFrame.mode()
pandas.DataFrame.median()
pandas.DataFrame.mean()
pandas.DataFrame.std()
pandas.DataFrame.sum()
...
df = pd.DataFrame([('bird', 2, 2),
                   ('mammal', 4, np.nan),
                   ('arthropod', 8, 0),
                   ('bird', 2, np.nan)],
                  index=('falcon', 'horse', 'spider', 'ostrich'),
                  columns=('species', 'legs', 'wings'))
df
           species  legs  wings
falcon        bird     2    2.0
horse       mammal     4    NaN
spider   arthropod     8    0.0
ostrich       bird     2    NaN

# axis=0 → aggregate down each column, axis=1 → aggregate across each row
df.min(axis=0)
# Output:
species    arthropod
legs               2
wings            0.0
dtype: object

df.max(axis=1)
# Output:
falcon     2.0
horse      4.0
spider     8.0
ostrich    2.0
dtype: float64
② DataFrame.value_counts()
DataFrame.value_counts(subset=None, normalize=False, sort=True, ascending=False, dropna=True)
Counts how often each value (or row combination) occurs; the result can be sorted ascending or descending, and the count can be restricted to particular columns/rows.
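A short hedged example on made-up data, counting the values of one column:

df = pd.DataFrame({'city': ['BJ', 'SH', 'BJ', 'SZ', 'BJ']})
df['city'].value_counts()
# BJ appears 3 times, SH and SZ once each, sorted by count in descending order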
③ DataFrame.groupby()
DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=NoDefault.no_default, observed=False, dropna=True)
Splits the data into groups by one or more keys.
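A short hedged example on made-up data, grouping by one column and aggregating another:

df = pd.DataFrame({'city': ['BJ', 'SH', 'BJ', 'SZ'],
                   'sales': [10, 20, 30, 40]})
df.groupby('city')['sales'].sum()
# BJ 40, SH 20, SZ 40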
④ DataFrameGroupBy.value_counts()
DataFrameGroupBy.value_counts(subset=None, normalize=False, sort=True, ascending=False, dropna=True)
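A short hedged example on made-up data, counting values inside each group. The per-column form below works in all recent pandas versions; the DataFrameGroupBy form df.groupby('city').value_counts() needs pandas 1.4 or later.

df = pd.DataFrame({'city': ['BJ', 'BJ', 'SH', 'SH'],
                   'kind': ['a', 'a', 'a', 'b']})
df.groupby('city')['kind'].value_counts()
# ('BJ', 'a') -> 2, ('SH', 'a') -> 1, ('SH', 'b') -> 1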
(12) Multi-level headers
① Reading a file with multi-level headers
df = pd.read_excel("POI.xlsx", header=[0,1], index_col=0)
Controlled via the header argument.
② Building a DataFrame with multi-level headers
Reference: there are many ways to do this.
'''
pandas.MultiIndex(levels=None, codes=None, sortorder=None, names=None, dtype=None, copy=False,
                  name=None, verify_integrity=True)
First build the MultiIndex, then create the DataFrame with index/columns = the MultiIndex.
'''
# An example:
columns = [('0', 'formatted_address'), ('1', 'sematic_description'), ('POIs', 'name'),
           ('POIs', 'address'), ('POIs', 'poiType'), ('POIs', 'tag'), ('POIs', 'longitude'),
           ('POIs', 'latitude'), ('POIs', 'direction'), ('POIs', 'distance')]
c = pd.MultiIndex.from_tuples(columns)
df = pd.DataFrame(columns=c)
③ Getting/setting values requires a tuple for the column key
df.at[0, ('POIs', 'distance')] = 20
④ Renaming headers
DataFrame.index
DataFrame.columns
DataFrame.rename()
DataFrame.rename_axis()
DataFrame.set_index()
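As a hedged sketch reusing the column tuples from ② above, one label on a single level can be renamed with the level argument, and the column levels themselves can be given axis names:

df = df.rename(columns={'distance': 'dist'}, level=1)    # rename one level-1 label
df.columns = df.columns.set_names(['group', 'field'])    # name the two column levels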
(13) DataFrame to dictionary
'''
DataFrame.to_dict(orient='dict', into=<class 'dict'>)

orient: str {'dict', 'list', 'series', 'split', 'records', 'index'}
Determines the type of the values of the dictionary.
'dict' (default) : dict like {column -> {index -> value}}
'list' : dict like {column -> [values]}
'series' : dict like {column -> Series(values)}
'split' : dict like {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
'tight' : dict like {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
          'index_names' -> [index.names], 'column_names' -> [column.names]}
'records' : list like [{column -> value}, ... , {column -> value}]
'index' : dict like {index -> {column -> value}}
Abbreviations are allowed. s indicates series and sp indicates split.
'''
Key points:
dict: key = column, inner key = row, inner value = value
index: key = row, inner key = column, inner value = value
list: key = column, value = the column as a list
records: one dict per row, key = column, value = value
df = pd.DataFrame({'col1': [1, 2], 'col2': [0.5, 0.75]}, index=['row1', 'row2'])
# df:
      col1  col2
row1     1  0.50
row2     2  0.75

df.to_dict()
# Output:
{'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}

df.to_dict('dict')
# Output:
{'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}

df.to_dict('list')
# Output:
{'col1': [1, 2], 'col2': [0.5, 0.75]}

# Same effect as 'list':
tmp_dict = {col: df[col].tolist() for col in df.columns}
print(tmp_dict)
# Output:
{'col1': [1, 2], 'col2': [0.5, 0.75]}

df.to_dict('series')
# Output:
{'col1': row1    1
         row2    2
Name: col1, dtype: int64,
 'col2': row1    0.50
         row2    0.75
Name: col2, dtype: float64}

df.to_dict('split')
# Output:
{'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]}

df.to_dict('tight')
# Output:
{'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]],
 'index_names': [None], 'column_names': [None]}

df.to_dict('records')
# Output:
[{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]

df.to_dict('index')
# Output:
{'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
4. Number Base Conversion
# Binary literals start with 0b or 0B
# Octal literals start with 0o or 0O
# Hexadecimal literals start with 0x or 0X

# Base 32/16/8/2 to base 10:
# base 32: '32' = 2×32⁰ + 3×32¹
print(int('32', 32))
# hex: a = 10
print(int('0xa', 16))
# octal: '12' = 2×8⁰ + 1×8¹
print(int('0o12', 8))
# binary: '1010' = 0×2⁰ + 1×2¹ + 0×2² + 1×2³
print(int('0b1010', 2))

# The prefix can be omitted
print(int('a', 16))
print(int('12', 8))
print(int('1010', 2))

# Decimal to binary
print(bin(10))
# Decimal to octal
print(oct(10))
# Decimal to hexadecimal
print(hex(10))

# Example
m = '000AC949'
print(int(m, 16))     # output: 706889
print(hex(706889))    # output: 0xac949
Bugs and Common Pitfalls
(1) After reading long numeric text, the trailing digits become 000 in the output
① Convert the long-text column to str while reading; the Excel output is then fine
df = pd.read_excel(file, header=0, index_col=0, converters={"XXX": str})
② Modify the long-text column slightly; the CSV output is then fine
def num_out(data):
    data = str(data) + '\t'    # append a tab so spreadsheet software keeps the value as text
    return data

df['XXX'] = df['XXX'].map(num_out)
(2) Variable errors
Inside the function, declare the variable with the global keyword so that it refers to the global variable, as sketched below.
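A minimal sketch of the fix (the counter variable here is made up):

count = 0              # module-level variable

def add_one():
    global count       # without this declaration, count += 1 raises UnboundLocalError
    count += 1

add_one()
print(count)           # 1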
(3) Testing for nan values
print(type(df.at[0, 'Amap']))
# <class 'numpy.float64'>

import pandas as pd
import numpy as np
import math

if np.isnan(df.at[0, 'Amap']):
    print('1')
if math.isnan(df.at[0, 'Amap']):
    print('1')
if pd.isnull(df.at[0, 'Amap']):
    print('1')
# Output:
1
1
1
(4) Selecting from a column where a given value is present
# Check whether a value exists in a column:
print(some_value in df['column_name'].values)

# Get the rows where the column equals the value
df.loc[df['column_name'] == some_value]
df.loc[df['column_name'].isin([some_value])]

# Get column 'XX' of the rows that contain the value
# via the index
print(df.at[df[df['column_name'] == some_value].index[0], 'XX'])
# directly
print(df.loc[df['column_name'] == some_value, 'XX'].values[0])
(5) Assigning to another column of the rows where a column equals a given value
# Set the '类型' column to 'abc' for the rows whose 'num' column is 2
df.loc[df['num'] == 2, '类型'] = 'abc'
# (a single .loc call; the chained form df.loc[:,'类型'].loc[df['num']==2] = 'abc' may silently fail to modify df)
(6) The format changes after saving
- If the data contains datetime values, especially computed time differences, save to CSV.
  If xlsx is needed, save to CSV first, then read that back in and save it as xlsx so it displays correctly.
- A column of lists comes back as strings after saving and re-reading.
  Convert it back with eval():

df['list'] = pd.eval(df['list'])    # element-wise alternative: df['list'] = df['list'].apply(eval)
Special Notes
Things to revisit later:
- reset_index
- reindex
- inplace=True
- If a method call uses inplace=True, do not write df = df.XXX(...), because the call returns None (see the sketch after this list)
- Open questions:
df.set_axis(['i', 'ii'], axis='columns', inplace=True)
df.reset_index(drop=True, inplace=True)
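A small sketch of the inplace pitfall noted above:

df = pd.DataFrame({'A': [3, 1, 2]})

df = df.sort_values('A', inplace=True)    # wrong: inplace=True returns None, so df becomes None
df = pd.DataFrame({'A': [3, 1, 2]})
df.sort_values('A', inplace=True)         # right: modify in place and keep the original name
df2 = df.sort_values('A')                 # also right: no inplace, assign the returned copy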