python合并excel的多个sheet

92 阅读 0 评论 61 点赞

我是靠谱客的博主飘逸泥猴桃，这篇文章主要介绍python合并excel的多个sheet，现在分享给大家，希望可以做个参考。

文章目录

- 简介
- openpyxl方式
- pandas方式

简介

因为，每天都会有的大量excel报表汇总处理任务，所以写了一个脚本来处理。

就是找出每一个excel中特定的sheet，把这些sheet的特定列读取出来合并到一个sheet中。

因为每一个sheet的数据都不太一样，所以稍微麻烦一点，下面使用openpyxl方式和pandas两种方式来处理。

openpyxl方式

使用openpyxl方式要自己实现合并逻辑，要麻烦一些。值得注意的是，在excel中可能有公式，读取excel的时候可以使用下面的方式：

load_workbook(data_file_path, data_only=True)

使用data_only=True,就可以得到公式计算之后的值，而不是公式本身，因为公式本身合并在另一个sheet中，公式可能就无效，甚至不对了。

下面给一个示例代码，仅供参考：

复制代码

"""
pip install openpyxl
"""
from openpyxl import load_workbook
from openpyxl import Workbook
import os
import re
# 模板文件
TEMPLATE_FILE = r'H:合并合并模板.xlsx'
# 合并结果文件
RESULT_FILE = r'H:合并结果.xlsx'
# 数据文件目录
DATA_ROOT_DIR = r"H:合并"
# 文件名称正则
DATA_FILE_REG = r"(.*?)-合同导入台账d{8}.xlsx"
# 获取要处理的文件
def get_deal_file_map():
file_sn_map = {}
fs = os.listdir(DATA_ROOT_DIR)
for f in fs:
match = re.match(DATA_FILE_REG, f)
if match:
city = match.group(1)
sn = 2
if city == '成都':
sn = 4
elif city == '杭州':
sn = 3
file_sn_map[os.path.join(DATA_ROOT_DIR, f)] = sn
return file_sn_map
# 规范化列名
def get_normal_column_name(origin_col_name):
if origin_col_name:
start = origin_col_name.find("（")
if start == -1:
return origin_col_name.strip()
else:
return origin_col_name[0:start].strip()
# 获取列名与列坐标的映射
def get_col_name_coordinate_map(sheet_row):
name_coor_map = {}
for cell in sheet_row:
# name_coor_map[get_normal_column_name(cell.value)] = cell.column_letter
name_coor_map[get_normal_column_name(cell.value)] = cell.column
return name_coor_map
# 获取模板文件的列名与列坐标映射
def get_template_name_coordinate_map(template_file_path):
template_wbook = load_workbook(template_file_path)
table = template_wbook[template_wbook.sheetnames[0]]
row = table[1:1]
return get_col_name_coordinate_map(row)
def deal_data_content():
"""
合并文件内容
"""
dfile_sn_map = get_deal_file_map()
save_book = Workbook()
wsheet = save_book.active
wsheet.title = 'merge-data'
tmp_col_coor_map = get_template_name_coordinate_map(TEMPLATE_FILE)
wsheet.append(list(tmp_col_coor_map.keys()))
line = 2
for data_file_path in dfile_sn_map.keys():
sheet_num = dfile_sn_map[data_file_path]
wbook = load_workbook(data_file_path, data_only=True)
names = wbook.sheetnames
for i in range(0, sheet_num):
table = wbook[names[i]]
row = table[1:1]
data_col_coor_map = get_col_name_coordinate_map(row)
use_col = data_col_coor_map.keys() & tmp_col_coor_map.keys()
for row in table.iter_rows(min_row=2, values_only=True):
rcol_index = data_col_coor_map['城市']
city = row[rcol_index - 1]
if (city is None) or len(city.strip()) == 0:
continue
for col_name in use_col:
rcol_index = data_col_coor_map[col_name]
wcol_index = tmp_col_coor_map[col_name]
wsheet.cell(line, wcol_index, row[rcol_index - 1])
line += 1
save_book.save(RESULT_FILE)
if __name__ == '__main__':
deal_data_content()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
pip install openpyxl
"""
from openpyxl import load_workbook
from openpyxl import Workbook
import os
import re
# 模板文件
TEMPLATE_FILE = r'H:合并合并模板.xlsx'
# 合并结果文件
RESULT_FILE = r'H:合并结果.xlsx'
# 数据文件目录
DATA_ROOT_DIR = r"H:合并"
# 文件名称正则
DATA_FILE_REG = r"(.*?)-合同导入台账d{8}.xlsx"
# 获取要处理的文件
def get_deal_file_map():
file_sn_map = {}
fs = os.listdir(DATA_ROOT_DIR)
for f in fs:
match = re.match(DATA_FILE_REG, f)
if match:
city = match.group(1)
sn = 2
if city == '成都':
sn = 4
elif city == '杭州':
sn = 3
file_sn_map[os.path.join(DATA_ROOT_DIR, f)] = sn
return file_sn_map
# 规范化列名
def get_normal_column_name(origin_col_name):
if origin_col_name:
start = origin_col_name.find("（")
if start == -1:
return origin_col_name.strip()
else:
return origin_col_name[0:start].strip()
# 获取列名与列坐标的映射
def get_col_name_coordinate_map(sheet_row):
name_coor_map = {}
for cell in sheet_row:
# name_coor_map[get_normal_column_name(cell.value)] = cell.column_letter
name_coor_map[get_normal_column_name(cell.value)] = cell.column
return name_coor_map
# 获取模板文件的列名与列坐标映射
def get_template_name_coordinate_map(template_file_path):
template_wbook = load_workbook(template_file_path)
table = template_wbook[template_wbook.sheetnames[0]]
row = table[1:1]
return get_col_name_coordinate_map(row)
def deal_data_content():
"""
合并文件内容
"""
dfile_sn_map = get_deal_file_map()
save_book = Workbook()
wsheet = save_book.active
wsheet.title = 'merge-data'
tmp_col_coor_map = get_template_name_coordinate_map(TEMPLATE_FILE)
wsheet.append(list(tmp_col_coor_map.keys()))
line = 2
for data_file_path in dfile_sn_map.keys():
sheet_num = dfile_sn_map[data_file_path]
wbook = load_workbook(data_file_path, data_only=True)
names = wbook.sheetnames
for i in range(0, sheet_num):
table = wbook[names[i]]
row = table[1:1]
data_col_coor_map = get_col_name_coordinate_map(row)
use_col = data_col_coor_map.keys() & tmp_col_coor_map.keys()
for row in table.iter_rows(min_row=2, values_only=True):
rcol_index = data_col_coor_map['城市']
city = row[rcol_index - 1]
if (city is None) or len(city.strip()) == 0:
continue
for col_name in use_col:
rcol_index = data_col_coor_map[col_name]
wcol_index = tmp_col_coor_map[col_name]
wsheet.cell(line, wcol_index, row[rcol_index - 1])
line += 1
save_book.save(RESULT_FILE)
if __name__ == '__main__':
deal_data_content()

pandas方式

相比于直接使用openpyxl的方式，使用pandas就方便多了，直接使用concat方法就可以了。

复制代码

1
2
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,keys=None, levels=None, names=None,verify_integrity=False,copy=True)

参数含义

参数	含义
objs	kist，Series、DataFrame、Panel
axis	默认为0，按行连接
join	inner、outer，默认为"outer"
keys	list，最外层构建层次索引，如果为多索引，使用元组
levels	list，用于构建MultiIndex的特定级别
names	list，结果层次索引中的级别的名称
copy	boolean，默认True。如果为False，请勿不必要地复制数据
join_axes	将要废弃，推荐在结果集上使用reindex
ignore_index	boolean，默认False。如果为True，忽略索引
verify_integrity	boolean，默认False。检查新连接的轴是否包含重复项

下面直接看示例：

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# coding:utf-8
import pandas as pd
# 读取指定文件的指定sheet
df1 = pd.read_excel(r'H:mergecd-contract-charge-1-20200807.xlsx', header=0, sheet_name=0)
df2 = pd.read_excel(r'H:mergecd-contract-charge-2-20200807.xlsx', header=0, sheet_name=1)
df3 = pd.read_excel(r'H:mergecd-contract-charge-3-20200807.xlsx', header=0, sheet_name=2)
df4 = pd.read_excel(r'H:mergehz-contract-charge-1-20200807.xlsx', header=0, sheet_name=0)
df5 = pd.read_excel(r'H:mergehz-contract-charge-2-20200807.xlsx', header=0, sheet_name=1)
# 按行拼接
data = pd.concat([df1, df2, df3, df4, df5], sort=False, ignore_index=True)
# 选择需要的列
header = ['日期', '合同号', '城市', '姓名', 'charge']
data = data.loc[:, header]
# 将结果写到值得excel文件
data.to_excel(r'H:mergeresult.xlsx', index=False)