Overview
The problem is as in the title: a crawler under Python 3.6 keeps scraping the same first-page content over and over.
I changed the for loop to a while loop and tried many variations, but there was still no effect. Could anyone take a look?
# coding:utf-8
import os

import lxml.html
import requests


class MyError(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return repr(self.value)


def get_lawyers_info(url):
    r = requests.get(url)
    html = lxml.html.fromstring(r.content)
    # phones = html.xpath('//span[@class="law-tel"]')
    phones = html.xpath('//span[@class="phone pull-right"]')
    # names = html.xpath('//div[@class="fl"]/p/a')
    names = html.xpath('//h4[@class="text-center"]')
    if len(phones) == len(names):
        # pair each name element with its phone element
        phone_infos = [(name.text, phone.text_content())
                       for name, phone in zip(names, phones)]
    else:
        error = "number of lawyer names does not match number of phone numbers: " + url
        raise MyError(error)
    phone_infos_list = []
    for phone_info in phone_infos:
        if phone_info[0] == "":
            info = "没留姓名" + ": " + phone_info[1] + "\r\n"  # "no name given"
        else:
            info = phone_info[0] + ": " + phone_info[1] + "\r\n"
        print(info)
        phone_infos_list.append(info)
    return phone_infos_list


dir_path = os.path.abspath(os.path.dirname(__file__))
print(dir_path)
file_path = os.path.join(dir_path, "lawyers_info.txt")
print(file_path)
if os.path.exists(file_path):
    os.remove(file_path)

# open file_path rather than the bare file name, so the file that was just
# removed and the file being written are guaranteed to be the same one
with open(file_path, "ab") as file:
    for i in range(1000):
        url = ("http://www.xxxx.com/cooperative_merchants?searchText=&industry=100"
               "&provinceId=19&cityId=0&areaId=0&page=" + str(i + 1))
        # r = requests.get(url)
        # html = lxml.html.fromstring(r.content)
        # phones = html.xpath('//span[@class="phone pull-right"]')
        # names = html.xpath('//h4[@class="text-center"]')
        # if phones or names:
        info = get_lawyers_info(url)
        for each in info:
            file.write(each.encode("gbk"))