Pandas读取.db数据库信息,并处理数据

92 阅读 0 评论 61 点赞

我是靠谱客的博主高挑薯片，这篇文章主要介绍Pandas读取.db数据库信息,并处理数据，现在分享给大家，希望可以做个参考。

import sqlite3
con = sqlite3.connect("chinook.db")
tracks = pd.read_sql_query("SELECT * FROM tracks", con)
tracks.head()

	TrackId	Name	AlbumId	MediaTypeId	GenreId	Composer	Milliseconds	Bytes	UnitPrice
0	1	For Those About To Rock (We Salute You)	1	1	1	Angus Young, Malcolm Young, Brian Johnson	343719	11170334	0.99
1	2	Balls to the Wall	2	2	1	None	342562	5510424	0.99
2	3	Fast As a Shark	3	2	1	F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho...	230619	3990994	0.99
3	4	Restless and Wild	3	2	1	F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D...	252051	4331779	0.99
4	5	Princess of the Dawn	3	2	1	Deaffy & R.A. Smith-Diesel	375418	6290521	0.99

# read_sql_query函数可以读取一张表，第一个参数是表名，第二个参数是引擎
genres = pd.read_sql_query("SELECT * FROM genres", con)
genres.head()

	GenreId	Name
0	1	Rock
1	2	Jazz
2	3	Metal
3	4	Alternative & Punk
4	5	Rock And Roll

genre_track = genres.merge(tracks[['GenreId', 'Milliseconds']],
                           on='GenreId', how='left').drop('GenreId', axis='columns')
genre_track.head()

	GenreId	Name
0	1	Rock
1	2	Jazz
2	3	Metal
3	4	Alternative & Punk
4	5	Rock And Roll

# 找到每种类型歌曲的平均时长
genre_time = genre_track.groupby('Name')['Milliseconds'].mean()
pd.to_timedelta(genre_time, unit='ms').dt.floor('s').sort_values()

Name
Rock And Roll        00:02:14
Opera                00:02:54
Hip Hop/Rap          00:02:58
Easy Listening       00:03:09
Bossa Nova           00:03:39
R&B/Soul             00:03:40
World                00:03:44
Pop                  00:03:49
Latin                00:03:52
Alternative & Punk   00:03:54
Soundtrack           00:04:04
Reggae               00:04:07
Alternative          00:04:24
Blues                00:04:30
Rock                 00:04:43
Jazz                 00:04:51
Classical            00:04:53
Heavy Metal          00:04:57
Electronica/Dance    00:05:02
Metal                00:05:09
Comedy               00:26:25
TV Shows             00:35:45
Drama                00:42:55
Science Fiction      00:43:45
Sci Fi & Fantasy     00:48:31
Name: Milliseconds, dtype: timedelta64[ns]

#找到每名顾客花费的总时长
cust = pd.read_sql_query("SELECT * FROM customers", con)                    
cust=cust[['CustomerId', 'FirstName', 'LastName']]

invoice = pd.read_sql_query("SELECT * FROM invoices", con)                    
invoice=invoice[['InvoiceId','CustomerId']]

ii = pd.read_sql_query("SELECT * FROM invoice_items", con)                    
ii=ii[['InvoiceId', 'UnitPrice', 'Quantity']]

cust_inv = cust.merge(invoice, on='CustomerId') 
                         .merge(ii, on='InvoiceId')
cust_inv.head()

	CustomerId	FirstName	LastName	InvoiceId	UnitPrice	Quantity
0	1	Luís	Gonçalves	98	1.99	1
1	1	Luís	Gonçalves	98	1.99	1
2	1	Luís	Gonçalves	121	0.99	1
3	1	Luís	Gonçalves	121	0.99	1
4	1	Luís	Gonçalves	121	0.99	1

# 现在可以用总量乘以单位价格，找到每名顾客的总消费
total = cust_inv['Quantity'] * cust_inv['UnitPrice']
cols = ['CustomerId', 'FirstName', 'LastName']
cust_inv.assign(Total = total).groupby(cols)['Total'].sum().sort_values(ascending=False).head()

CustomerId  FirstName  LastName  
6           Helena     Holý          49.62
26          Richard    Cunningham    47.62
57          Luis       Rojas         46.62
46          Hugh       O'Reilly      45.62
45          Ladislav   Kovács        45.62
Name: Total, dtype: float64

# sql语句查询方法read_sql_query
pd.read_sql_query('select * from tracks limit 5', con)

	TrackId	Name	AlbumId	MediaTypeId	GenreId	Composer	Milliseconds	Bytes	UnitPrice
0	1	For Those About To Rock (We Salute You)	1	1	1	Angus Young, Malcolm Young, Brian Johnson	343719	11170334	0.99
1	2	Balls to the Wall	2	2	1	None	342562	5510424	0.99
2	3	Fast As a Shark	3	2	1	F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho...	230619	3990994	0.99
3	4	Restless and Wild	3	2	1	F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D...	252051	4331779	0.99
4	5	Princess of the Dawn	3	2	1	Deaffy & R.A. Smith-Diesel	375418	6290521	0.99

# 可以将长字符串传给read_sql_query
sql_string1 = '''
          select 
              Name, 
              time(avg(Milliseconds) / 1000, 'unixepoch') as avg_time
          from (
                  select 
                      g.Name, 
                      t.Milliseconds
                  from 
                      genres as g 
                  join
                      tracks as t
                      on 
                          g.genreid == t.genreid
              )
          group by 
              Name
          order by 
              avg_time
          '''
pd.read_sql_query(sql_string1, con).head()

	Name	avg_time
0	Rock And Roll	00:02:14
1	Opera	00:02:54
2	Hip Hop/Rap	00:02:58
3	Easy Listening	00:03:09
4	Bossa Nova	00:03:39

sql_string2 = '''
select 
                c.customerid, 
                c.FirstName, 
                c.LastName, 
                sum(ii.quantity *  ii.unitprice) as Total
          from
                customers as c
          join
                invoices as i
                     on c.customerid = i.customerid
          join
                invoice_items as ii
                     on i.invoiceid = ii.invoiceid
          group by
                c.customerid, c.FirstName, c.LastName
          order by
                Total desc
          '''
pd.read_sql_query(sql_string2, con).head()