pandas 处理 Excel 文件并将结果写入数据库中

130 阅读 0 评论 86 点赞

我是靠谱客的博主自信红牛，这篇文章主要介绍pandas 处理 Excel 文件并将结果写入数据库中，现在分享给大家，希望可以做个参考。

需要注意的是pandas处理数据，数字格式的null 会被显示为nan,所以需要简单的处理一下：

复制代码

1
2
str_obj.replace("nan", "null")

完整代码如下：

复制代码

# -*- coding: utf-8 -*-

import pandas as pd
from datetime import datetime
from impala.dbapi import connect
from impala.util import as_pandas
import os

file = r'E:\日报{}.{}.xlsx'

def import_hive(sql):
    conn = connect(host='host', port=xx)
    cur = conn.cursor()
    print('要执行的SQL是' + sql)
    cur.execute(sql)
    conn.close()

tsk_sql = 'INSERT INTO TABLE xxx VALUES'
amt_sql = 'INSERT INTO TABLE xxx VALUES'

def parse_from_excel(year: int, month: int, day_range: range):
    for i in day_range:
        this_day = datetime(year=year, month=month, day=i)
        mon = this_day.strftime('%Y-%m')
        filename = file.format(this_day.month, this_day.day)
        if os.path.exists(filename):
            df = pd.read_excel(filename, 0, header=None)
            v0 = df.iloc[2:9, [0]].values  
            v1 = df.iloc[2:9, [1]].values  
            v2 = df.iloc[2:9, [2]].values  
            v3 = df.iloc[2:9, [3]].values  
            v4 = df.iloc[2:9, [4]].values  
            tsk_lists = []
            amt_lists = []
         
            for z in range(0, len(v1)):
                tsk_val = "('%s','%s','%s')" % (mon, v0[z][0], str(round(v3[z][0], 6)))
                amt_val = "('%s','%s',%0.6f,%0.6f,%0.4f)" % (
                    this_day.date().__str__(), v0[z][0], float(round(v1[z][0], 6)),
                    round(v2[z][0], 6), round(v4[z][0], 4))

amt_lists.append(amt_val)
                tsk_lists.append(tsk_val)

tsk_val_link = tsk_sql + ','.join(tsk_lists)
            amt_val_link = amt_sql + ','.join(amt_lists)

import_hive(amt_val_link.replace("nan", "null"))

# 只有每月的1号才会写入销售任务
            if this_day.day == 10:
                import_hive(tsk_val_link.replace("nan", "null"))
        else:
            print('%s文件不存在' % filename)

if __name__ == '__main__':
    parse_from_excel(year=2021, month=4, day_range=range(1, 11))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding: utf-8 -*-

import pandas as pd
from datetime import datetime
from impala.dbapi import connect
from impala.util import as_pandas
import os

file = r'E:\日报{}.{}.xlsx'


def import_hive(sql):
    conn = connect(host='host', port=xx)
    cur = conn.cursor()
    print('要执行的SQL是' + sql)
    cur.execute(sql)
    conn.close()


tsk_sql = 'INSERT INTO TABLE xxx VALUES'
amt_sql = 'INSERT INTO TABLE xxx VALUES'

def parse_from_excel(year: int, month: int, day_range: range):
    for i in day_range:
        this_day = datetime(year=year, month=month, day=i)
        mon = this_day.strftime('%Y-%m')
        filename = file.format(this_day.month, this_day.day)
        if os.path.exists(filename):
            df = pd.read_excel(filename, 0, header=None)
            v0 = df.iloc[2:9, [0]].values  
            v1 = df.iloc[2:9, [1]].values  
            v2 = df.iloc[2:9, [2]].values  
            v3 = df.iloc[2:9, [3]].values  
            v4 = df.iloc[2:9, [4]].values  
            tsk_lists = []
            amt_lists = []
         
            for z in range(0, len(v1)):
                tsk_val = "('%s','%s','%s')" % (mon, v0[z][0], str(round(v3[z][0], 6)))
                amt_val = "('%s','%s',%0.6f,%0.6f,%0.4f)" % (
                    this_day.date().__str__(), v0[z][0], float(round(v1[z][0], 6)),
                    round(v2[z][0], 6), round(v4[z][0], 4))

                amt_lists.append(amt_val)
                tsk_lists.append(tsk_val)

            tsk_val_link = tsk_sql + ','.join(tsk_lists)
            amt_val_link = amt_sql + ','.join(amt_lists)

            import_hive(amt_val_link.replace("nan", "null"))

            # 只有每月的1号才会写入销售任务
            if this_day.day == 10:
                import_hive(tsk_val_link.replace("nan", "null"))
        else:
            print('%s文件不存在' % filename)


if __name__ == '__main__':
    parse_from_excel(year=2021, month=4, day_range=range(1, 11))