概述
#! /usr/bin/env python
#coding=utf8
# by meichenhui@gmail.com 2010/5/30
from HTMLParser import HTMLParser
from pyquery import PyQuery as pq
import sqlite3,urllib2,logging,sys
from datetime import datetime
from decimal import Decimal
# Logging initialisation.
# The root logger writes INFO and above to a file in the current directory
# whose name is the start time (YYYYmmddHHMMSS.log); filemode 'w' truncates
# any existing file with that name.
logFileName='./%s.log'%(datetime.now().strftime("%Y%m%d%H%M%S"))
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
datefmt='%m-%d %H:%M',
filename=logFileName,
filemode='w')
# The 'transchedule' logger used throughout this script additionally gets a
# stdout handler with its own (shorter) format and has its level set to DEBUG.
logger = logging.getLogger('transchedule')
hdlr = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)
# NOTE: a `global` statement at module scope has no effect; the name is
# already module-level. Functions that rebind it declare `global` themselves.
global witchTrain # sequence number of the train that the rows of the current page belong to
witchTrain = 0
# Database initialisation.
# Opens (or creates) trainsInfo.sqlite in the current directory. The insert
# statements in this file target tables `trains_Info` and `trains_schedule`;
# no CREATE TABLE is visible here, so presumably they must already exist.
conn = sqlite3.connect('trainsInfo.sqlite')
c = conn.cursor()
def getTrainInfo(i,e):
    """Parse a train summary table into one record per train number.

    Written as a ``PyQuery.map`` callback. Every cell that is read may hold
    several space-separated values, one per train number; the n-th value of
    each cell is appended to the n-th record.

    Args:
        i: Index of the element within the mapped selection; unused.
        e: The table element to parse.

    Returns:
        A list of records, each a list of
        ``[train_number, runtime_minutes, start_station, terminal_station,
        departure_time, arrival_time, train_type, distance]``.
        Times are ``datetime`` objects parsed with ``'%H:%M'`` (so the date
        part is strptime's default).

    Raises:
        ValueError: A runtime, time or distance value does not have the
            expected format.
        IndexError: A cell holds more values than there are train numbers.
    """
    # Wrap the element once instead of re-wrapping it for every cell.
    table = pq(e)
    flatCells = table('tr td')
    rows = table('tr')

    def cellValues(cell):
        return cell.text().split(' ')

    def rowValues(rowIndex, cellIndex):
        return cellValues(rows.eq(rowIndex)('td').eq(cellIndex))

    def parseRuntime(text):
        # "<hours>时<minutes><one trailing character>" -> total minutes.
        # The trailing character is presumably the minutes unit.
        parts = text.split(u'时')
        return int(parts[0])*60 + int(parts[1][0:-1])

    def parseClock(text):
        return datetime.strptime(text,'%H:%M')

    def parseDistance(text):
        # Drop the two trailing characters (presumably a unit suffix).
        return int(text[0:-2])

    # Train numbers: one record is started per number.
    result = [[oneTNum] for oneTNum in cellValues(flatCells.eq(2))]

    def appendColumn(values, convert=None):
        for counter, value in enumerate(values):
            if convert is not None:
                value = convert(value)
            result[counter].append(value)

    appendColumn(cellValues(flatCells.eq(4)), parseRuntime)   # running time
    appendColumn(rowValues(1, 1))                             # starting station
    appendColumn(rowValues(1, 3))                             # terminal station
    appendColumn(rowValues(2, 1), parseClock)                 # departure time
    appendColumn(rowValues(2, 3), parseClock)                 # arrival time
    appendColumn(rowValues(3, 1))                             # train type
    appendColumn(rowValues(3, 3), parseDistance)              # total distance
    return result
def insertTrainInfo(trainData,cursor):
    """Insert one train record into ``trains_Info`` and log the outcome.

    Args:
        trainData: Sequence of the eight column values; element 0 is the
            train number used in the log messages.
        cursor: DB-API cursor the statement is executed on. Committing is
            left to the caller.

    Any exception raised while inserting or logging success is logged as an
    error together with the train number and is not re-raised.
    """
    statement = 'insert into trains_Info values(?,?,?,?,?,?,?,?)'
    try:
        cursor.execute(statement, trainData)
        logger.info("train number %s processed"%trainData[0])
    except Exception as err:
        logger.error("%s %s"%(err,trainData[0]))
def _clockOrMidnight(cell):
    """Parse the cell text as ``HH:MM``; fall back to 00:00 if it is not a time."""
    try:
        return datetime.strptime(cell.text(),'%H:%M')
    except (TypeError, ValueError):
        # Best effort: cells that do not hold a time are stored as 00:00.
        return datetime.strptime("00:00",'%H:%M')


def _priceOrDefault(cell, default, placeholder=None):
    """Return the cell text without its last character, or *default*.

    *default* is returned when the cell is missing, its text is at most one
    character long, or the stripped text equals *placeholder*.
    """
    if not cell:
        return default
    text = cell.text()
    if len(text) <= 1:
        return default
    price = text[:-1]
    if placeholder is not None and price == placeholder:
        return default
    return price


def getScheduleInfo(i,e):
    """Parse one row of the stop table into a schedule record.

    Written as a ``PyQuery.map`` callback. Header rows (first cell ``'No.'``
    or empty) advance the module-level ``witchTrain`` counter and yield
    nothing, so every following data row is tagged with the ordinal of the
    train section it belongs to.

    Args:
        i: Index of the row within the mapped selection; unused.
        e: The ``tr`` element to parse.

    Returns:
        ``None`` for header rows and for rows with exactly two cells (the
        latter are logged as parse errors). Otherwise a one-element list
        wrapping the record
        ``[witchTrain, stop_no, cell_1_text, cell_4_text, stop_time,
        start_time, distance, hard_seat, hard_berth, soft_seat, soft_berth]``.
        The extra list level is presumably there because ``PyQuery.map``
        flattens returned lists.

    Raises:
        ValueError: The stop number or the distance cell is not numeric.
    """
    global witchTrain  # ordinal of the train the current rows belong to
    td = pq(e)('td')
    if td.eq(0).text() in ('No.',""):
        witchTrain += 1
        return
    # Rows with only two cells cannot be parsed; report and skip them.
    if len(td) == 2:
        logger.error("%s:%s"%(td.text().encode('gb18030'),len(td)))
        return
    stopTime = _clockOrMidnight(td.eq(5))    # time the train stops at the station
    startTime = _clockOrMidnight(td.eq(6))   # time the train leaves the station
    # Mileage: drop the two trailing characters (presumably a unit suffix).
    distance = int(td.eq(7).text()[:-2])
    # Prices keep the page's text with the last character removed.
    hardSeatPrice = _priceOrDefault(td.eq(8), 0.0)
    # A hard-berth (middle bunk) value of "-" counts as "no price".
    hardBerthPrice = _priceOrDefault(td.eq(9), 0.0, placeholder="-")
    softSeatPrice = _priceOrDefault(td.eq(10), 0.0)
    # Soft berth (lower bunk). NOTE(review): its default is the string '0'
    # while the other prices default to 0.0; kept as is so stored values for
    # rows without a soft-berth price do not change.
    softBerthPrice = _priceOrDefault(td.eq(11), '0')
    return [[witchTrain,
             int(td.eq(0).text()),td.eq(1).text(),
             td.eq(4).text(),stopTime,
             startTime,distance,
             hardSeatPrice,hardBerthPrice,
             softSeatPrice,softBerthPrice,]]
def insertTrainSchedule(trainInfos,scheduleData,cursor):
    """Insert one stop record into ``trains_schedule``.

    Args:
        trainInfos: Train records of the current page; element 0 of each is
            the train number.
        scheduleData: Stop record whose element 0 is the 1-based ordinal of
            the owning train. It is replaced IN PLACE by that train's number
            before the insert.
        cursor: DB-API cursor the statement is executed on. Committing is
            left to the caller.

    A failing insert is logged with the train number and not re-raised; a
    failing train lookup (done before the insert) does propagate.
    """
    trainIndex = scheduleData[0]-1
    scheduleData[0] = trainInfos[trainIndex][0]
    statement = 'insert into trains_schedule values(?,?,?,?,?,?,?,?,?,?,?)'
    try:
        cursor.execute(statement, scheduleData)
    except Exception as err:
        logger.error("%s %s"%(err,scheduleData[0]))
def getNextPageLink(e):
    """Return the ``href`` of *e* if it is the "next page" link, else ``None``.

    An element counts as the next-page link when its text is exactly
    u"下一页".
    """
    link = pq(e)
    if link.text() != u"下一页":
        return None
    return link.attr('href')
# Selector for the tables of a train detail page. Index 1 of the match is
# used as the train summary table and index 2 as the stop table.
_TRAIN_TABLES_SELECTOR = ('body center div.ResultContent div.ResultContentLeft '
                          'div.ResultContentLeftContent '
                          'div.ResultTrainCodeContent table')


def _processTrainPage(trainUrl):
    """Download one train detail page and store its trains and stops."""
    global witchTrain
    response = urllib2.urlopen(trainUrl)
    try:
        # Rewrite the declared charset so the parser decodes the page as
        # gb18030 instead of gb2312.
        pageContent = response.read().replace("gb2312","gb18030")
    finally:
        response.close()
    # Parse the page once; both tables are taken from the same selection.
    tables = pq(pageContent)(_TRAIN_TABLES_SELECTOR)
    # Train summary
    trainInfos = tables.eq(1).map(getTrainInfo)
    for oneTrainInfo in trainInfos:
        insertTrainInfo(oneTrainInfo,c)
        conn.commit()
    # Stops along the route. getScheduleInfo counts header rows in
    # witchTrain, so the counter starts from zero for every page and is
    # left at zero afterwards.
    witchTrain = 0
    trainSchedules = tables.eq(2)('tr').map(getScheduleInfo)
    witchTrain = 0
    for oneTrainSchedule in trainSchedules:
        insertTrainSchedule(trainInfos,oneTrainSchedule,c)
        conn.commit()


# Collect the trains listed on a page and follow the pagination
def processTrainsInPage(url):
    """Process every train linked from the list page *url* and all following pages.

    For each list page, every train link is downloaded and stored, then the
    "next page" link is followed. Processing stops when a page has no
    next-page link or when that link points back to the page itself.

    Pages are walked in a loop rather than by recursion, so the number of
    list pages is not limited by the interpreter's recursion depth.

    Args:
        url: Absolute URL of the first list page.
    """
    while True:
        # Fetch and parse the list page
        d = pq(url=url)
        # Links to the individual trains
        lis = d('body center div.ListContent div.ListContentLeft ul li')
        lis.make_links_absolute()
        trains = lis.map(lambda i,e:pq(e)('a').attr('href'))
        for oneTrain in trains:
            _processTrainPage(oneTrain)
        # Locate the next page
        nextPageTable = d('body center div.ListContent div.ListContentLeft div.ListContentLeftContent').eq(2)('a')
        nextPageTable.make_links_absolute()
        nextLinks = nextPageTable.map(lambda i,e:getNextPageLink(e))
        if not nextLinks:
            # Last page: there is no next-page link to follow.
            logger.info("no next page link on %s"%url)
            return
        nextUrl = nextLinks[0]
        if nextUrl == url:
            logger.info(nextUrl.encode('gb18030'))
            return
        logger.info("start process %s"%nextUrl)
        url = nextUrl
# Script entry: crawl from the first list page. Any error ends the crawl and
# is logged with its traceback; the database handles are released either way.
# Every insert is committed right after it is executed, so closing here does
# not discard pending work.
logger.info("start get data...")
try:
    processTrainsInPage(r"http://www.tielu.org/TrainList/TrainList-1.html")
except Exception as e:
    logger.exception(e)
finally:
    c.close()
    conn.close()
logger.info("finish...")
最后
以上就是小巧长颈鹿为你收集整理的python爬取网页表格_python网页爬虫之列车时刻表的抓取(4)-完整的python脚本的全部内容,希望文章能够帮你解决python爬取网页表格_python网页爬虫之列车时刻表的抓取(4)-完整的python脚本所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
发表评论 取消回复