Overview

This example crawls the NetEase news (news.163.com) headline feed, follows each article link, extracts the article body with BeautifulSoup plus regular expressions, and stores the results in a local SQLite database. The code is Python 2 (note the print statements and the reload(sys) encoding hack).

#coding:utf-8
import random, re, sqlite3, json, sys, uuid, hashlib
import requests
from bs4 import BeautifulSoup

reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 only
session = requests.session()

def md5(src):
    # md5 of the article URL doubles as a unique row key.
    m = hashlib.md5()
    m.update(src)
    return m.hexdigest()

def wangyi():
    for page in range(1, 3):
        # Page 1 has no suffix; later pages are cm_yaowen_02.js, ...
        k = "" if page == 1 else "_0" + str(page)
        url = "http://temp.163.com/special/00804KVA/cm_yaowen" + k + ".js?callback=data_callback"
        print url
        headers = {
            "Host": "temp.163.com",
            "Connection": "keep-alive",
            "Accept": "*/*",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 LBBROWSER",
            "Referer": "http://news.163.com/",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
        }
        result = session.get(url=url, headers=headers).text
        try:
            # JSONP response: data_callback([...]); strip the wrapper, then eval the JS array literal.
            result1 = eval(eval(json.dumps(result).replace('data_callback(', '').replace(')', '').replace(' ', '')))
        except:
            continue  # skip this page if the payload cannot be parsed
        try:
            for item in result1:
                tlink = item['tlink']
                headers2 = {
                    "Host": "news.163.com",
                    "Connection": "keep-alive",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                    "Upgrade-Insecure-Requests": "1",
                    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36 LBBROWSER",
                    "Accept-Encoding": "gzip, deflate, sdch",
                    "Accept-Language": "zh-CN,zh;q=0.8",
                }
                print "tlink:", tlink
                return_data = session.get(url=tlink, headers=headers2).text
                try:
                    soup = BeautifulSoup(return_data, 'html.parser')
                    returnSoup = soup.find_all("div", attrs={"id": "endText"})[0]
                    # The tags in these two patterns were stripped when the post was published;
                    # '<p>(.*?)</p>' and the f_center variant are reconstructions of the intent.
                    try:
                        content1 = ''.join(re.findall(r'<p>(.*?)</p>', str(returnSoup)))
                    except:
                        content1 = ""
                    try:
                        content2 = ''.join(re.findall(r'<p class="f_center">(.*?)</p>', str(returnSoup)))
                    except:
                        content2 = ""
                    content = content1 + content2
                except:
                    content = ""
                # Raw string: the Windows path contains backslashes.
                cx = sqlite3.connect(r"C:\Users\xuchunlin\PycharmProjects\study\db.sqlite3", check_same_thread=False)
                cx.text_factory = str
                try:
                    print "正在插入链接 %s 数据" % url
                    title = item['title'].decode('unicode_escape')
                    commenturl = item['commenturl']
                    tienum = item['tienum']
                    opentime = item['time']
                    print title, tlink, commenturl, tienum, opentime
                    url2 = md5(str(tlink))
                    cx.execute("INSERT INTO wangyi (title,tlink,commenturl,tienum,opentime,content,url) VALUES (?,?,?,?,?,?,?)",
                               (str(title), str(tlink), str(commenturl), str(tienum), str(opentime), str(content), str(url2)))
                except Exception as e:
                    print e, "insert failed"
                cx.commit()
                cx.close()
        except:
            pass

wangyi()
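
A note on the double eval: it executes whatever the server sends back, which is risky. If you want the same result more defensively, a small helper can cut the payload out of the data_callback(...) wrapper and parse it with ast.literal_eval, which accepts literals only. This helper is a sketch, not part of the original post, and it assumes the payload is built from plain string/number/list/dict literals (JS true/false/null would still need special handling):

import ast, re

def parse_jsonp(text):
    # Grab everything between 'data_callback(' and the final ')'.
    m = re.search(r'data_callback\((.*)\)\s*;?\s*$', text, re.S)
    if not m:
        return []
    try:
        # literal_eval only accepts Python literals, so it cannot execute code.
        return ast.literal_eval(m.group(1))
    except (ValueError, SyntaxError):
        return []

# usage: result1 = parse_jsonp(result)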
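
The INSERT also assumes a wangyi table already exists; the original post never shows its schema. A minimal sketch that matches the seven columns used by the INSERT (the TEXT types and the UNIQUE constraint on the url column are assumptions on my part):

import sqlite3

cx = sqlite3.connect(r"C:\Users\xuchunlin\PycharmProjects\study\db.sqlite3")
cx.execute("""
    CREATE TABLE IF NOT EXISTS wangyi (
        title      TEXT,
        tlink      TEXT,
        commenturl TEXT,
        tienum     TEXT,
        opentime   TEXT,
        content    TEXT,
        url        TEXT UNIQUE  -- md5 of tlink; assumed to act as a dedupe key
    )
""")
cx.commit()
cx.close()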
Finally
That wraps up Python 爬虫实例(4)—— 爬取网易新闻: the complete source for crawling NetEase news. Hopefully it helps if you are working through the same scraping problem.