我是靠谱客的博主 小巧长颈鹿,最近开发中收集的这篇文章主要介绍python爬取网页表格_python网页爬虫之列车时刻表的抓取(4)-完整的python脚本,觉得挺不错的,现在分享给大家,希望可以做个参考。

概述

#! /usr/bin/env python
# coding=utf8
# by meichenhui@gmail.com 2010/5/30
#
# Python 2 script: scrapes Chinese train timetables from www.tielu.org
# and stores them in a local SQLite database (trainsInfo.sqlite).

from HTMLParser import HTMLParser
from pyquery import PyQuery as pq
import sqlite3,urllib2,logging,sys
from datetime import datetime
from decimal import Decimal

# --- logging initialisation -------------------------------------------
# One log file per run, named after the start timestamp.
logFileName='./%s.log'%(datetime.now().strftime("%Y%m%d%H%M%S"))
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                    datefmt='%m-%d %H:%M',
                    filename=logFileName,
                    filemode='w')
logger = logging.getLogger('transchedule')

# Mirror all log records to stdout as well as the file.
hdlr = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

# Sequence number of the train currently being processed on the page.
# (A module-level ``global`` statement is a no-op and was removed; the
# functions that assign to this name declare ``global witchTrain``
# themselves.)
witchTrain = 0

# --- database initialisation ------------------------------------------
conn = sqlite3.connect('trainsInfo.sqlite')
c = conn.cursor()

def getTrainInfo(i,e):
    """Parse one train-summary <table> into a list of train records.

    Designed as a pyquery ``.map`` callback, hence the (index, element)
    signature; ``i`` is unused.

    Each cell of the summary table holds the values for *all* trains on
    the page separated by spaces, so every column is split on ' ' and
    the pieces are zipped together positionally into one record per
    train:

        [train number, running time (minutes), origin, destination,
         departure time, arrival time, train class, distance (km)]
    """
    doc = pq(e)

    def cell_values(row, col):
        # Space-separated tokens of cell (row, col), one per train.
        return doc('tr').eq(row)('td').eq(col).text().split(' ')

    # Train numbers (flat td index 2) seed one record per train.
    result = [[oneTNum] for oneTNum in doc('tr td').eq(2).text().split(' ')]

    # Running time, e.g. u'5时30分' -> minutes.
    # NOTE(review): assumes the u'X时Y分' format; a value without u'时'
    # raises here, exactly as the original code did.
    for (counter, oneRuntime) in enumerate(doc('tr td').eq(4).text().split(' ')):
        parts = oneRuntime.split(u'时')
        result[counter].append(int(parts[0])*60 + int(parts[1][0:-1]))

    for (counter, value) in enumerate(cell_values(1, 1)):   # origin station
        result[counter].append(value)
    for (counter, value) in enumerate(cell_values(1, 3)):   # terminal station
        result[counter].append(value)
    for (counter, value) in enumerate(cell_values(2, 1)):   # departure time
        result[counter].append(datetime.strptime(value, '%H:%M'))
    for (counter, value) in enumerate(cell_values(2, 3)):   # arrival time
        result[counter].append(datetime.strptime(value, '%H:%M'))
    for (counter, value) in enumerate(cell_values(3, 1)):   # train class
        result[counter].append(value)
    for (counter, value) in enumerate(cell_values(3, 3)):   # distance, strip km suffix
        result[counter].append(int(value[0:-2]))

    return result

def insertTrainInfo(trainData, cursor):
    """Insert one train record (8 columns) into ``trains_Info``.

    Failures (e.g. a duplicate train number) are logged and swallowed so
    a single bad record does not abort the whole crawl.
    """
    try:
        cursor.execute('insert into trains_Info values(?,?,?,?,?,?,?,?)',
                       trainData)
        # Lazy %-args: the message is only formatted if the record is emitted.
        logger.info("train number %s processed", trainData[0])
    except Exception as e:
        logger.error("%s %s", e, trainData[0])

def getScheduleInfo(i, e):
    """Parse one <tr> of the schedule table into a stop record.

    pyquery ``.map`` callback.  Rows whose first cell is 'No.' or empty
    are the header rows separating consecutive trains; they advance the
    global train counter ``witchTrain`` and yield no record.

    Returns ``[[witchTrain, stop no., station name, train number,
    stop time, departure time, distance (km), hard seat / hard berth /
    soft seat / soft berth prices]]`` or ``None`` for header/bad rows.
    """
    global witchTrain  # index of the train the current rows belong to
    td = pq(e)('td')

    # Header row: marks the start of the next train's schedule.
    if td.eq(0).text() in ('No.', ""):
        witchTrain += 1
        return

    # Malformed rows (seen in the wild) have only two cells; skip them.
    if len(td) == 2:
        logger.error("%s:%s", td.text().encode('gb18030'), len(td))
        return

    def parse_time(cell):
        # 'HH:MM' -> datetime; blank/garbage cells fall back to midnight.
        try:
            return datetime.strptime(cell.text(), '%H:%M')
        except Exception:
            return datetime.strptime("00:00", '%H:%M')

    stopTime = parse_time(td.eq(5))   # arrival ("stop") time at this station
    startTime = parse_time(td.eq(6))  # departure time from this station

    # Distance from the origin; strip the km suffix.
    # (Renamed from ``range``, which shadowed the builtin.)
    distance = int(td.eq(7).text()[:-2])

    def parse_price(cell, reject_dash=False):
        # 'NN.N元' -> 'NN.N'; a missing cell or a '-' placeholder means
        # the train has no such seat class -> 0.0.
        text = cell.text() if cell else ''
        if text is not None and len(text) > 1 and not (reject_dash and text[:-1] == "-"):
            return text[:-1]
        return 0.0

    hardSeatPrice = parse_price(td.eq(8))                     # hard seat
    hardBerthPrice = parse_price(td.eq(9), reject_dash=True)  # hard berth (middle)
    softSeatPrice = parse_price(td.eq(10))                    # soft seat
    # BUG FIX: the original assigned the parsed value to a dead variable
    # ``softBerth``, so the soft-berth price was always the initial '0'.
    softBerthPrice = parse_price(td.eq(11))                   # soft berth (lower)

    return [[witchTrain,
             int(td.eq(0).text()), td.eq(1).text(),
             td.eq(4).text(), stopTime,
             startTime, distance,
             hardSeatPrice, hardBerthPrice,
             softSeatPrice, softBerthPrice]]

def insertTrainSchedule(trainInfos, scheduleData, cursor):
    """Insert one stop record (11 columns) into ``trains_schedule``.

    ``scheduleData[0]`` arrives as a 1-based index into ``trainInfos``;
    it is replaced in place by the actual train number before the
    insert.  Failures are logged and swallowed.
    """
    scheduleData[0] = trainInfos[scheduleData[0] - 1][0]
    try:
        cursor.execute('insert into trains_schedule values(?,?,?,?,?,?,?,?,?,?,?)',
                       scheduleData)
    except Exception as e:
        logger.error("%s %s", e, scheduleData[0])

def getNextPageLink(e):
    """Return the href of *e* if it is the u"下一页" ("next page") anchor.

    Any other element yields None, so mapping this over the pager's
    links keeps only the next-page URL.
    """
    anchor = pq(e)
    if anchor.text() != u"下一页":
        return None
    return anchor.attr('href')

# Process every train listed on one index page, then recurse to the next page.
def processTrainsInPage(url):
    """Crawl one listing page of www.tielu.org.

    For every train detail page linked from *url*: parse the summary
    table into ``trains_Info`` rows and the schedule table into
    ``trains_schedule`` rows (via the module-level connection ``conn``
    and cursor ``c``), committing after each table.  Then follow the
    u"下一页" (next page) link recursively until it points back at the
    current URL (last page).
    """
    # Parse the listing page.
    d = pq(url=url)

    # Collect absolute URLs of the per-train detail pages.
    lis = d('body center div.ListContent div.ListContentLeft ul li')
    lis.make_links_absolute()
    trains = lis.map(lambda i, e: pq(e)('a').attr('href'))

    # Shared selector for both tables on a detail page (the blog copy had
    # this literal broken across two lines; rejoined here).
    detail_selector = ('body center div.ResultContent div.ResultContentLeft '
                       'div.ResultContentLeftContent '
                       'div.ResultTrainCodeContent table')

    for oneTrain in trains:
        # The pages declare charset gb2312 but actually contain gb18030
        # characters; patch the declaration so pyquery decodes correctly.
        pageContent = urllib2.urlopen(oneTrain).read().replace("gb2312", "gb18030")

        # Summary table (index 1): one record per train on the page.
        trainInfo = pq(pageContent)(detail_selector).eq(1)
        trainInfos = trainInfo.map(getTrainInfo)
        for oneTrainInfo in trainInfos:
            insertTrainInfo(oneTrainInfo, c)
        conn.commit()

        # Schedule table (index 2): one row per stop, all trains mixed.
        # Reset the train counter BEFORE the map that consumes it (the
        # original reset it afterwards, relying on the previous
        # iteration's reset for correctness).
        global witchTrain
        witchTrain = 0
        trainSchedule = pq(pageContent)(detail_selector).eq(2)('tr')
        trainSchedules = trainSchedule.map(getScheduleInfo)
        for oneTrainSchedule in trainSchedules:
            insertTrainSchedule(trainInfos, oneTrainSchedule, c)
        conn.commit()

    # Pager: the third content div holds the page links; keep only the
    # u"下一页" href (getNextPageLink returns None for everything else,
    # which pyquery's map drops).
    nextPageTable = d('body center div.ListContent div.ListContentLeft '
                      'div.ListContentLeftContent').eq(2)('a')
    nextPageTable.make_links_absolute()
    lis = nextPageTable.map(lambda i, e: getNextPageLink(e))
    if not lis:
        # Robustness fix: no pager links at all -> stop instead of IndexError.
        logger.info("no next page link on %s", url)
    elif lis[0] == url:
        # Next page points back here: this was the last page.
        logger.info(lis[0].encode('gb18030'))
    else:
        logger.info("start process %s", lis[0])
        processTrainsInPage(lis[0])

logger.info("start get data...")
try:
    # Kick off the crawl from the first listing page; recursion inside
    # processTrainsInPage walks the remaining pages.
    processTrainsInPage(r"http://www.tielu.org/TrainList/TrainList-1.html")
except Exception as e:
    # Top-level boundary: log the failure instead of dumping a traceback.
    logger.error(e)
logger.info("finish...")

最后

以上就是小巧长颈鹿为你收集整理的《python爬取网页表格——python网页爬虫之列车时刻表的抓取(4):完整的python脚本》的全部内容,希望这篇文章能够帮你解决相关的程序开发问题。

如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。

本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
点赞(30)

评论列表共有 0 条评论

立即
投稿
返回
顶部