Python 爬虫：把网页中的文字写入 txt 文件

48 阅读 0 评论 32 点赞

我是靠谱客的博主合适大树，这篇文章主要介绍如何用 Python 爬虫把网页中的文字写入 txt 文件，现在分享给大家，希望可以做个参考。

import requests
from bs4 import BeautifulSoup
from lxml import etree
import re
# Fetch one chinanews.com article, pull the text of its body paragraphs,
# drop the leading wire-service dateline, and save the result to text2.txt.

link = 'http://www.chinanews.com/gn/2020/03-18/9129903.shtml'
# A desktop Chrome User-Agent is sent with the request
# (presumably so the site serves the regular page -- not verified here).
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
}
r = requests.get(link, headers=header, timeout=20)
# Stop with a clear HTTP error instead of parsing an error page as the article.
r.raise_for_status()
# Force UTF-8 decoding of the body rather than relying on requests' guess.
r.encoding = 'utf-8'

html = etree.HTML(r.text)
# Text nodes that are direct children of <p> inside <div class="left_zw">.
page = html.xpath('//div[@class="left_zw"]/p/text()')

# Turn ideographic spaces (U+3000) and no-break spaces (U+00A0) into plain
# spaces, trim each fragment, and skip fragments that end up empty.
# The escapes need their backslashes: 'u3000' / 'xa0' without them are just
# ordinary letter sequences and would never match the special spaces.
paragraphs = []
for i in page:
    i = i.replace('\u3000', ' ').replace('\xa0', ' ').strip()
    if i:
        paragraphs.append(i)

# Fragments are concatenated without a separator, so the article becomes one
# continuous string. Processing it in memory avoids re-reading the file (and
# the IndexError the re-read raised when nothing had been extracted).
str1 = ''.join(paragraphs)

# Remove the leading dateline, i.e. text of the form "...社...<M>月<D>日电".
# The pattern is anchored at the start and lazy so that only the first
# dateline is removed; a greedy, unanchored pattern would delete everything
# up to the last "<M>月<D>日电" that appears anywhere in the article.
str1 = re.sub(r'^.*?社.*?\d{1,2}月\d{1,2}日电', '', str1, count=1).strip()

# gb18030 is used instead of gbk: it is designed as a superset of GBK, so
# GBK-encodable text is written with the same bytes as before, but
# characters outside GBK no longer raise UnicodeEncodeError.
with open('text2.txt', 'w', encoding='gb18030') as file:
    file.write(str1)