我是靠谱客的博主 忧虑大碗。最近开发中我收集整理了这篇文章,主要介绍"产品组合创新性对点击量"研究中的 python 爬虫实现,包括三部分:1、选取电商零售企业的 python 代码;2、对直播情况进行爬取的 python 代码;3、对直播中产品信息的爬虫代码。觉得挺不错的,现在分享给大家,希望可以做个参考。
概述
1、选取电商零售企业的python代码
import requests
import json
import os
import random
import time
import pandas as pd
def sleep_func(x, y):
    """Pause for a random duration between x*0.1 and (y-1)*0.1 seconds.

    Throttles requests: scraping too fast with a single account makes
    requests start failing after a while.
    """
    delay = random.randrange(x, y) * 0.1
    time.sleep(delay)
def get_live(lv, si, se):
    """Collect an anchor's live-stream records for the last 30 days.

    Pages through the GetLiveAnchorVideoRecord API (10 records per page,
    range=2 selects the 30-day window) and appends one dict per stream
    to the module-level ``livestreams``. Reads module globals: headers,
    name, RoomNumber, livestreams.
    """
    head = r'https://api.zhigua.cn/v1/Anchor/GetLiveAnchorVideoRecord?pageIndex='
    seg_anchor = r'&pageSize=10&liveAnchorId='
    seg_sign = r'&sign='
    seg_seckey = r'&seckey='
    tail = r'&isAuction=-1&range=2&_=1617936262247'  # range=2 -> last 30 days
    page = 1
    while True:
        sleep_func(10, 50)
        url = head + str(page) + seg_anchor + lv + seg_sign + si + seg_seckey + se + tail
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        payload = json.loads(resp.text, strict=False)
        for xt in payload['Data']["ItemList"]:
            livestreams.append({
                "主播名称": name, "ID": RoomNumber, "直播标题": xt["Title"],
                "直播标签": xt["Tags"], "直播开始时间": xt["StartTime"],
                "直播结束时间": xt["EndTime"], "持续时长(秒)": xt["LiveDuration"],
                "场观UV(估)": xt["TotalJoinCount"], "PV": xt["ViewCount"],
                "直播涨粉": xt["FanGrowth"], "商品数": xt["GoodsCount"],
                "客单价(估)": xt["SalePrice"], "带货销量(估)": xt["SaleCount"],
                "带货GMV(估)": xt["SaleAmount"],
            })
        page += 1
        # Last page reached once the 10-per-page page count falls below `page`.
        if payload["Data"]["TotalCount"] // 10 + 1 < page:
            break
def get_store(lv, si, se):
    """Collect the shops associated with an anchor over the last 30 days.

    Pages through the SellerRank API (10 rows per page) and appends one
    dict per shop to the module-level ``stores``. Reads module globals:
    headers, name, RoomNumber, stores.
    """
    head = r'https://api.zhigua.cn/v1/AnchorSellerAnalysis/SellerRank?pageSize=10&pageIndex='
    seg_anchor = r'&liveAnchorId='
    seg_sign = r'&sign='
    seg_seckey = r'&seckey='
    tail = r'&range=2&_=1617938250625'  # range=2 -> last 30 days
    page = 1
    while True:
        sleep_func(10, 50)
        url = head + str(page) + seg_anchor + lv + seg_sign + si + seg_seckey + se + tail
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        payload = json.loads(resp.text, strict=False)
        for xt in payload['Data']["ItemList"]:
            stores.append({
                "主播名称": name, "ID": RoomNumber,
                '店铺名称': xt["ShopName"], "店铺ID": xt["SellerId"],
                "店铺等级": xt["CreditLevel"], "合作商品(个数)": xt["TBItemIdCount"],
                "店铺带货客单价": xt["TransAmount"], "店铺带货销量": xt["SaleCount"],
                "店铺带货GMV": xt["SaleAmount"], "涉及直播次数": xt["LiveCount"],
                "曝光度": xt["Rate"],
            })
        page += 1
        # Stop once past the final 10-per-page page.
        if payload["Data"]["TotalCount"] // 10 + 1 < page:
            break
def get_anchor(lv, si):
    """Fetch one anchor's profile, store it, then pull the anchor's
    30-day live records and associated shops.

    Appends a profile row to the module-level ``anchors`` and delegates
    to get_live / get_store with the SecKey from the profile response.
    Reads module globals: headers, name, RoomNumber, anchors.
    """
    url = (r'https://api.zhigua.cn/v1/anchor/GetAnchorByID?liveAnchorId='
           + lv + r'&sign=' + si + r'&_=1614490065737')
    sleep_func(10, 50)
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    profile = json.loads(resp.text, strict=False)['Data']
    anchors.append({
        "主播名称": name, "ID": RoomNumber, "地区": profile["Location"],
        "主播类型": profile["AnchorTypeName"], "所属行业": profile["CategoryName"],
        "擅长领域": profile["TBBroadCasterField"], "粉丝数": profile["FansCount"],
        "最爱TA": profile["TBFollowTopCount"], "开播场次": profile["LiveCountIncluded"],
        "场均点赞": profile["AvgLike"], "场均评论": profile["AvgComment"],
    })
    sec_key = str(profile["SecKey"])
    get_live(lv, si, sec_key)
    get_store(lv, si, sec_key)
# ---- Accumulators for the scraped rows ----
anchors = []  # anchor (seller) profile rows
ranks = []  # weekly ranking rows
livestreams = []  # live-stream records, last 30 days
stores = []  # shops associated with each anchor
# Request headers. NOTE(review): the Cookie carries a logged-in
# zhigua.cn session; it expires and must be refreshed before each run.
headers = {"User-Agent": r"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                         r"Chrome/80.0.3987.87 Safari/537.36",
           "Cookie": "_data_chl=key=baidu-zhiguaci-pinpaici; Hm_lvt_bb59088c73528f1463ed89952b5c7e64=1617772534,1617877140,1617933065,1617952954; Hm_lpvt_bb59088c73528f1463ed89952b5c7e64=1617952954; Qs_lvt_340242=1617622644%2C1617772534%2C1617877140%2C1617933065%2C1617952954; Qs_pv_340242=2662092684555760000%2C3352647256333484500%2C4609210324117199000%2C4313385324937449000%2C940474279766886100; User=UserId=cb40c50b8688818f&Password=7b0de2266359c1123669e550c7b39ab9&ChildId=3420&ts=1618039548&acid=-1238459688; User_Lower=UserId=cb40c50b8688818f&Password=7b0de2266359c1123669e550c7b39ab9&ChildId=3420&ts=1618039548&acid=-1238459688"}
# Taobao weekly ranking endpoint, week 2021-03-29 .. 2021-04-04
# (dateCode/period/CategoryId are baked into the query string).
f1 = r'https://api.zhigua.cn/v1/rank/GetLiveAnchorMarketRank?pageIndex='
l1 = r'&pageSize=10&dateCode=20210404&period=7&CategoryId=7&subType=3&_=1617935527412'
# NOTE(review): this path looks garbled -- the backslashes were probably
# stripped when the code was published (e.g. r'C:\Users\...\Desktop\...');
# fix it before running.
os.chdir(r'C:UsersDesktop数据淘宝店铺店铺基本信息')
# Walk ranking pages 1..8; each page lists 10 anchors.
for i in range(1, 9):
    sleep_func(10, 50)
    if i % 20 == 0:  # extra cool-down every 20 pages (never hit with only 8 pages)
        sleep_func(150, 300)
    url1 = f1 + str(i) + l1
    r1 = requests.get(url1, headers=headers, timeout=30)
    r1.raise_for_status()
    r1.encoding = 'utf-8'
    rt1 = json.loads(r1.text, strict=False)
    info = rt1["Data"]["ItemList"]
    for n in info:
        AnchorId = str(n["LiveAnchorId"])
        sign = str(n["IdKey"])
        # name / RoomNumber are read as globals by get_anchor/get_live/get_store.
        name = n["Nick"]
        RoomNumber = n["RoomNumber"]
        rank = {"主播名称": name, "ID": RoomNumber, "知瓜指数": n["Score"], "粉丝数": n["FansCount"],
                "点赞数": n["LikeCount"], "观看次数": n["ViewCount"], "直播时长": n["LiveDuration"]}
        ranks.append(rank)
        # Fetches the profile and, inside, the 30-day live + shop data.
        get_anchor(AnchorId, sign)
        print(name)
    print(i)
# NOTE(review): these reads load previously exported workbooks, but
# data1..data4 are never used later in this script as published --
# presumably the save/merge step was omitted from the post.
data1 = pd.read_excel('主播基本信息.xlsx')
data2 = pd.read_excel('店铺排行信息.xlsx')
data3 = pd.read_excel('近30天直播信息.xlsx')
data4 = pd.read_excel('近30天关联店铺信息.xlsx')
2、对直播情况进行爬取的python代码
import requests
import json
import os
import random
import time
import pandas as pd
def sleep_func(x, y):
    """Sleep a random 0.1-second multiple drawn from [x, y).

    Rate-limits the scraper: a single account that requests too fast
    starts getting failures after a while.
    """
    time.sleep(random.randrange(x, y) * 0.1)
def get_anchor(lv, si):
    """Fetch the anchor's profile and return its SecKey string.

    The SecKey is required to sign the per-anchor record endpoints.
    Reads the module-level ``headers``.
    """
    url = (r'https://api.zhigua.cn/v1/anchor/GetAnchorByID?liveAnchorId='
           + lv + r'&sign=' + si + r'&_=1614490065737')
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding
    payload = json.loads(resp.text, strict=False)
    return str(payload['Data']["SecKey"])
def get_brand_category(br, idk, dc, sk, lva):
    """Fetch the price-band and category distribution for one live stream.

    Appends a single row to the module-level ``categorys``. Reads module
    globals: headers, name, RoomNumber, Title, Tags, StartTime.
    """
    url = (r'https://api.zhigua.cn/v1/Live/GetGoodsDistributedInfo?liveVideoId='
           + br + r'&sign=' + idk + r'&dateCode=' + dc + r'&seckey=' + sk
           + r'&PreStatus=0&liveAnchorId=' + lva + r'&_=1618061169415')
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    payload = json.loads(resp.text, strict=False)
    row = {"主播名称": name, "ID": RoomNumber, "直播标题": Title,
           "直播标签": Tags, "直播开始时间": StartTime}
    # One column per price band, prefixed to avoid key clashes.
    for band in payload['Data']["Price"]:
        row["价格区间" + str(band["Title"])] = band["Value"]
    # One column per goods category.
    for cat in payload['Data']["Category"]:
        row[cat["Title"]] = cat["Value"]
    categorys.append(row)
def get_clock(br, idk, dc, sk):
    """Fetch per-time-slot go-to-shop stats for one live stream.

    Appends one row of GoBuyCount values to the module-level
    ``clock_infos`` and one row of GoBuyAmount values to
    ``clock_amounts``, keyed by time slot. Reads module globals:
    headers, name, RoomNumber, Title, Tags, StartTime.
    """
    url = (r'https://api.zhigua.cn/v1/Live/GetLiveVideoStatByTimeLines?liveVideoId='
           + br + r'&sign=' + idk + r'&dateCode=' + dc + r'&seckey=' + sk
           + r'&dataType=6&_=1618058876809')
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    slots = json.loads(resp.text, strict=False)['Data']
    common = {"主播名称": name, "ID": RoomNumber, "直播标题": Title,
              "直播标签": Tags, "直播开始时间": StartTime,
              "开始统计时段": slots[0]["Time"]}
    counts = dict(common)
    amounts = dict(common)
    for slot in slots:
        tag = "时间段" + str(slot["MomentTag"])
        counts[tag] = slot["Value"]["GoBuyCount"]
        amounts[tag] = slot["Value"]["GoBuyAmount"]
    clock_infos.append(counts)
    clock_amounts.append(amounts)
# Driver for one live stream.
def get_live(lv_Id, si, date):
    """Fetch one stream's overview row, then its time-line stats and
    its goods category distribution.

    Appends to the module-level ``livestreams`` and delegates to
    get_clock / get_brand_category using the SecKey from the overview
    response. Reads module globals: headers, name, RoomNumber, Title, Tags.
    """
    url = (r'https://api.zhigua.cn/v1/Live/GetLive?liveVideoId=' + lv_Id
           + r'&sign=' + si + r'&dateCode=' + str(date) + r'&_=1614079673832')
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    inf = json.loads(resp.text, strict=False)['Data']
    start_time = inf['StartTime']
    livestreams.append({
        "主播名称": name, "ID": RoomNumber, "直播开始时间": start_time,
        "直播标题": Title, "直播标签": Tags,
        "直播结束时间": inf['EndTime'], "持续时长(秒)": inf['LiveDuration'],
        "观看UV(估)": inf["TotalJoinCount"], "观看PV": inf["ViewCount"],
        "点赞数": inf["LikeCount"], "评论弹幕数": inf["TextMsgCount"],
        "进店人数": inf["GoBuyCount"], "增粉数": inf["FanGrowth"],
        "客单价(估)": inf["UnitPrice"], '实际成交额(AI)': inf["DiscountSaleAmount"],
        "平均坑产": inf["AvgPitPrice"], "用户分享": inf["LiveShareCount"],
    })
    sec_key = str(inf['SecKey'])
    sleep_func(10, 50)
    get_clock(lv_Id, si, date, sec_key)
    sleep_func(10, 50)
    anchor_id = str(inf["LiveAnchorId"])
    sleep_func(10, 50)
    get_brand_category(lv_Id, si, date, sec_key, anchor_id)
# Taobao weekly ranking endpoint, week 2021-03-29 .. 2021-04-04
f1 = r'https://api.zhigua.cn/v1/rank/GetLiveAnchorMarketRank?pageIndex='
l1 = r'&pageSize=10&dateCode=20210404&period=7&CategoryId=7&subType=3&_=1617935527412'
# NOTE(review): this path looks garbled -- backslashes were probably lost
# when the code was published (e.g. r'C:\Users\...\Desktop\...'); fix before running.
os.chdir(r'C:UsersDesktop数据淘宝店铺直播数据')
data = pd.read_excel('近30天直播信息-筛选.xlsx')
anchors = list(set(data['主播名称'].tolist()))
# ---- Accumulators for the scraped rows ----
livestreams = []  # per-stream overview rows
clock_infos = []  # per-time-slot GoBuyCount rows
clock_amounts = []  # per-time-slot GoBuyAmount rows
categorys = []  # price-band / category distribution rows
# Previously exported workbooks; the new rows are appended to these on save.
data2 = pd.read_excel(r'直播信息概览.xlsx')
data3 = pd.read_excel(r'进店人数.xlsx')
data4 = pd.read_excel(r'进店次数.xlsx')
data5 = pd.read_excel(r'商品分类分布.xlsx')
# Resume logic: anchors fully processed in the previous run are skipped;
# the last anchor in the previous export may be half-done, so it stays in
# the work list and its already-captured streams are removed below.
finished_anchors = list(set(data2["主播名称"].tolist()))
last_anchor = data2.loc[len(data2)-1, "主播名称"]
finished_anchors.remove(last_anchor)
anchoring = []
for anchor in anchors:
    if anchor not in finished_anchors:
        anchoring.append(anchor)
# NOTE(review): the Cookie is a logged-in zhigua.cn session and expires.
headers = {"User-Agent": r"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                         r"Chrome/80.0.3987.87 Safari/537.36",
           "Cookie": "_data_chl=key=baidu-zhiguaci-pinpaici; Qs_lvt_340242=1618104837%2C1618124856%2C1618143092%2C1618150086%2C1618188237; Qs_pv_340242=2600672622879513600%2C2771621366381835000%2C3687495452173190000%2C263328052567666940%2C4554603154311060000; Hm_lvt_bb59088c73528f1463ed89952b5c7e64=1618124855,1618143079,1618150075,1618188245; Hm_lpvt_bb59088c73528f1463ed89952b5c7e64=1618188245; User=UserId=cb40c50b8688818f&Password=f15767c9a0610155552022fd99e22ccd&ChildId=2628&ts=1618274717&acid=-360711811; User_Lower=UserId=cb40c50b8688818f&Password=f15767c9a0610155552022fd99e22ccd&ChildId=2628&ts=1618274717&acid=-360711811"}
# Per-anchor live records, last 60 days (range=3)
f2 = r'https://api.zhigua.cn/v1/Anchor/GetLiveAnchorVideoRecord?pageIndex='
m1 = r'&pageSize=10&liveAnchorId='
m2 = r'&sign='
m3 = r'&seckey='
l2 = r'&isAuction=-1&range=3&_=1618058414489'
# Walk ranking pages 1..10 (10 anchors per page).
for n in range(1, 11):
    sleep_func(10, 150)
    url1 = f1 + str(n) + l1
    r1 = requests.get(url1, headers=headers, timeout=30)
    r1.raise_for_status()
    r1.encoding = 'utf-8'
    rt1 = json.loads(r1.text, strict=False)
    info1 = rt1["Data"]["ItemList"]
    for si in info1:
        # name / RoomNumber / Title / Tags / StartTime are read as globals
        # by get_live / get_clock / get_brand_category.
        name = si["Nick"]
        # Only anchors still pending after the resume filter above.
        if name in anchoring:
            AnchorId = str(si["LiveAnchorId"])
            sign = str(si["IdKey"])
            RoomNumber = si["RoomNumber"]
            print(name)
            SecKey = get_anchor(AnchorId, sign)
            sleep_func(100, 350)
            # Streams of this anchor selected for scraping.
            lives = data.loc[data['主播名称'] == name]
            times = lives['直播开始时间'].tolist()
            if name == last_anchor:
                # Drop the streams already captured in the previous run.
                finished_lives = data2.loc[data2['主播名称'] == name]
                finished_times = finished_lives['直播开始时间'].tolist()
                for ft in finished_times:
                    if ft in times:
                        times.remove(ft)
            ti = 1
            # Page through the anchor's 60-day record list; scrape only the
            # streams whose StartTime is in the selection.
            while 1:
                url2 = f2 + str(ti) + m1 + AnchorId + m2 + sign + m3 + SecKey + l2
                r2 = requests.get(url2, headers=headers, timeout=30)
                r2.raise_for_status()
                r2.encoding = 'utf-8'
                rt2 = json.loads(r2.text, strict=False)
                info2 = rt2['Data']["ItemList"]
                for i in info2:
                    StartTime = i['StartTime']
                    if StartTime in times:
                        DateCode = i['DateCode']
                        Tags = i['Tags']
                        Title = i['Title']
                        LiveVideoId = str(i['LiveVideoId'])
                        IdKey = str(i['IdKey'])
                        print(Title)
                        sleep_func(50, 200)
                        get_live(LiveVideoId, IdKey, str(DateCode))
                ti += 1
                ci = rt2["Data"]["TotalCount"]
                countt = ci // 10 + 1  # last page number at 10 rows per page
                if countt < ti:
                    break
# Append the new rows to the previous exports and write back.
df2 = pd.DataFrame(livestreams)
df2 = pd.concat([data2, df2])
df2.to_excel('直播信息概览.xlsx', index=False)
df3 = pd.DataFrame(clock_infos)
df3 = pd.concat([data3, df3])
df3.to_excel('进店人数.xlsx', index=False)
df4 = pd.DataFrame(clock_amounts)
df4 = pd.concat([data4, df4])
df4.to_excel('进店次数.xlsx', index=False)
df5 = pd.DataFrame(categorys)
df5 = pd.concat([data5, df5])
df5.to_excel('商品分类分布.xlsx', index=False)
3、对直播中产品信息的爬虫代码
import requests
import json
import os
import random
import time
import pandas as pd
def sleep_func(x, y):
    """Sleep for a random multiple of 0.1 s drawn uniformly from [x, y).

    Keeps the single-account scrape rate low enough that requests do not
    start failing.
    """
    tenths = random.randrange(x, y)
    time.sleep(tenths * 0.1)
def get_anchor(lv, si):
    """Look up one anchor's profile and return the SecKey string that
    signs the per-anchor endpoints. Reads the module-level ``headers``."""
    parts = [r'https://api.zhigua.cn/v1/anchor/GetAnchorByID?liveAnchorId=',
             lv, r'&sign=', si, r'&_=1614490065737']
    reply = requests.get(''.join(parts), headers=headers, timeout=30)
    reply.raise_for_status()
    reply.encoding = reply.apparent_encoding
    body = json.loads(reply.text, strict=False)
    return str(body['Data']["SecKey"])
def tim(bst, st1):
    """Compare a stream's start time against a related stream's start time.

    Both arguments are "%Y-%m-%d %H:%M:%S" strings; ``bst`` is the
    current stream, ``st1`` the (earliest) related stream.

    Returns:
        -1 : related stream starts AFTER this one (inconsistent data)
         1 : identical timestamps -- the product debuted in this stream (new)
         0 : related stream within the previous 30 days -- not new
         2 : related stream more than 30 days back -- not new overall, but
             the 30-day window still needs a separate check
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    gap = (time.mktime(time.strptime(bst, fmt))
           - time.mktime(time.strptime(st1, fmt)))
    if gap < 0:
        return -1
    if gap == 0:
        return 1
    # 2592000 s == 30 days
    return 0 if gap <= 2592000 else 2
def get_new(br, idk, dc, sk, tbi):
    """Classify whether product ``tbi`` is "new" in live stream ``br``.

    Checks the product's related-live history via GoodsRelatedLives:
    first the 30-day window, then -- if the product debuted in this
    stream within 30 days -- the 60-day window, paging through results.
    Return codes follow ``tim``: -1 inconsistent data, 1 new, 0/2 not new.
    Reads module globals: StartTime, headers.

    NOTE(review): when the 30-day check yields 2 no branch is taken and
    the function falls through, implicitly returning None -- confirm
    whether that path can occur. This helper also appears unused; the
    published script re-implements the same logic inline in get_goods.
    """
    # URL fragments for the GoodsRelatedLives endpoint.
    first = r'https://api.zhigua.cn/v1/Live/GoodsRelatedLives?pageIndex='
    middl1 = r'&pageSize=10&liveVideoId='
    middl2 = r'&sign='
    middl3 = r'&dateCode='
    middl4 = r'&seckey='
    middl5 = r'&PreStatus=0&TBItemId='
    la1 = r'&sort=4&range=2&_=1618196432155'  # last 30 days
    la2 = r'&sort=4&range=3&_=1618196494073'  # last 60 days
    sleep_func(10, 50)
    # Page 1 of the 30-day related-live list.
    urlf1 = first + str(1) + middl1 + br + middl2 + idk + middl3 + dc + middl4 + sk + middl5 + tbi + la1
    rf1 = requests.get(urlf1, headers=headers, timeout=30)
    rf1.raise_for_status()
    rf1.encoding = 'utf-8'
    rft1 = json.loads(rf1.text, strict=False)
    infof1 = rft1['Data']["ItemList"]
    # First item: presumably the earliest related live under sort=4 -- TODO confirm.
    stf1 = infof1[0]["StartTime"]
    new_good1 = tim(StartTime, stf1)
    if new_good1 <= 0:
        # -1 (inconsistent) or 0 (definitely not new): done.
        return new_good1
    elif new_good1 == 1:
        # New within 30 days -- verify against the 60-day window too.
        sleep_func(10, 50)
        urlf2 = first + str(1) + middl1 + br + middl2 + idk + middl3 + dc + middl4 + sk + middl5 + tbi + la2
        rf2 = requests.get(urlf2, headers=headers, timeout=30)
        rf2.raise_for_status()
        rf2.encoding = 'utf-8'
        rft2 = json.loads(rf2.text, strict=False)
        infof2 = rft2['Data']["ItemList"]
        stf2 = infof2[0]["StartTime"]
        new_good2 = tim(StartTime, stf2)
        if new_good2 <= 1:
            return new_good2
        else:
            # First item was > 30 days out of order; scan the rest of page 1.
            for f2 in infof2:
                stf = f2["StartTime"]
                new_goodf = tim(StartTime, stf)
                if new_goodf <= 1:
                    break
            if new_goodf <= 1:
                return new_goodf
            else:
                # Page through the remaining 60-day results until a
                # conclusive code (<= 1) appears or pages run out.
                cf = rft2["Data"]["TotalCount"]
                nfi = 2
                while 1:
                    sleep_func(10, 50)
                    urlf2 = first + str(nfi) + middl1 + br + middl2 + idk + middl3 + dc + middl4 + sk + middl5 + tbi + la2
                    rf2 = requests.get(urlf2, headers=headers, timeout=30)
                    rf2.raise_for_status()
                    rf2.encoding = 'utf-8'
                    rft2 = json.loads(rf2.text, strict=False)
                    infof2 = rft2['Data']["ItemList"]
                    for f2 in infof2:
                        stf = f2["StartTime"]
                        new_goodf = tim(StartTime, stf)
                        if new_goodf <= 1:
                            break
                    if new_goodf <= 1:
                        break
                    nfi += 1
                    countf = cf // 10 + 1  # last page number at 10 rows per page
                    if countf < nfi:
                        break
                return new_goodf
def get_goods(br, idk, dc, sk, ni=1):
    """Scrape every product shown in live stream ``br`` and flag newness.

    Pages through GetGoodsInLiveVideo starting at page ``ni`` (default 1;
    the default also lets callers omit the page index). For each product
    it fetches the 60-day related-live list and derives two flags via
    ``tim``: '是否新品' (from the earliest related live) and
    '近30天是否新品' (scanning/paging until a conclusive code appears).
    Appends one dict per product to the module-level ``goods``. Reads
    module globals: headers, name, RoomNumber, Title, Tags, StartTime.
    """
    # Goods-in-stream endpoint fragments.
    before = r'https://api.zhigua.cn/v1/Live/GetGoodsInLiveVideo?pageIndex='
    middle1 = r'&pageSize=10&liveVideoId='
    middle2 = r'&sign='
    middle3 = r'&dateCode='
    middle4 = r'&seckey='
    last = r'&sort=10&_=1618061169684'
    # Related-lives endpoint fragments (range=3 -> last 60 days).
    bef = r'https://api.zhigua.cn/v1/Live/GoodsRelatedLives?pageIndex='
    las1 = r'&PreStatus=0&TBItemId='
    las2 = r'&sort=4&range=3&_=1618196494073'
    while 1:
        sleep_func(150, 300)
        if ni % 8 == 0:  # extra cool-down every 8 pages
            sleep_func(350, 600)
        url = before + str(ni) + middle1 + br + middle2 + idk + middle3 + dc + middle4 + sk + last
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        rt = json.loads(r.text, strict=False)
        info = rt['Data']["ItemList"]
        for xt in info:
            goodInform = {"主播名称": name, "ID": RoomNumber, "直播标题": Title, "直播标签": Tags, "直播开始时间": StartTime,
                          "商品序号": xt["GoodsIndex"], "商品名称": xt["Title"], "品牌": xt["BrandName"], "价格": xt["Price"],
                          "上架时间": xt["StartTime"], "讲解开始": xt["ExplainTime"], "讲解结束": xt["ExplainEndTime"],
                          "营销词": xt["ItemRights"], "直播间同播": xt["SameTimeLiveCount"],
                          "直播销量": xt["SaleCount"], "关联播主": xt["AnchorCount"], "关联直播": xt["LiveCount"]}
            TBItemId = str(xt["TBItemId"])
            sleep_func(10, 50)
            # Page 1 of the product's 60-day related lives; the first item
            # presumably carries the earliest start time -- TODO confirm sort=4.
            urlx = bef + str(1) + middle1 + br + middle2 + idk + middle3 + dc + middle4 + sk + las1 + TBItemId + las2
            rx = requests.get(urlx, headers=headers, timeout=30)
            rx.raise_for_status()
            rx.encoding = 'utf-8'
            rxt = json.loads(rx.text, strict=False)
            infox = rxt['Data']["ItemList"]
            st = infox[0]["StartTime"]
            new_good = tim(StartTime, st)
            goodInform['是否新品'] = new_good
            month_new = new_good
            if month_new == 2:
                # Inconclusive: scan page 1 for a code <= 1.
                for f1 in infox:
                    stf = f1["StartTime"]
                    new_goodf = tim(StartTime, stf)
                    if new_goodf <= 1:
                        month_new = new_goodf
                        break
                if month_new > 1:
                    # Still inconclusive: page through the rest.
                    cf = rxt["Data"]["TotalCount"]
                    nfi = 2
                    while 1:
                        sleep_func(10, 50)
                        if nfi % 8 == 0:
                            sleep_func(100, 250)
                        # BUG FIX: original used `nfi & 20 == 0` (bitwise AND),
                        # which never fires as intended; modulo is what the
                        # parallel `% 8` cool-down clearly means.
                        if nfi % 20 == 0:
                            sleep_func(300, 550)
                        urlf2 = bef + str(nfi) + middle1 + br + middle2 + idk + middle3 + dc + middle4 + sk + las1 + TBItemId + las2
                        rf2 = requests.get(urlf2, headers=headers, timeout=30)
                        rf2.raise_for_status()
                        rf2.encoding = 'utf-8'
                        rft2 = json.loads(rf2.text, strict=False)
                        infof2 = rft2['Data']["ItemList"]
                        for f2 in infof2:
                            stf = f2["StartTime"]
                            new_goodf = tim(StartTime, stf)
                            # NOTE(review): downgrading 2 -> 1 here looks
                            # intentional ("new within the 30-day window") but
                            # is undocumented -- confirm against the analysis.
                            if new_goodf == 2:
                                month_new = 1
                            if new_goodf <= 1:
                                month_new = new_goodf
                                break
                        if new_goodf <= 1:
                            break
                        nfi += 1
                        countf = cf // 10 + 1  # last page at 10 rows per page
                        if countf < nfi:
                            break
            goodInform['近30天是否新品'] = month_new
            goods.append(goodInform)
        print(ni)
        ni += 1
        c = rt["Data"]["TotalCount"]
        count = c // 10 + 1  # last page number of the goods list
        if count < ni:
            break
# Driver for one live stream.
def get_live(lv_Id, si, date):
    """Fetch one stream's overview to obtain its SecKey, then scrape the
    stream's goods via get_goods.

    Reads the module-level ``headers``; results accumulate in the
    module-level ``goods`` through get_goods.
    """
    befo = r'https://api.zhigua.cn/v1/Live/GetLive?liveVideoId='
    midd1 = r'&sign='
    midd2 = r'&dateCode='
    la = r'&_=1614079673832'
    urlt = befo + lv_Id + midd1 + si + midd2 + str(date) + la
    ra = requests.get(urlt, headers=headers, timeout=30)
    ra.raise_for_status()
    ra.encoding = 'utf-8'
    rat = json.loads(ra.text, strict=False)
    inf = rat['Data']
    SecKey1 = str(inf['SecKey'])
    sleep_func(10, 50)
    # BUG FIX: get_goods requires a starting page index; the original call
    # omitted it and raised TypeError at runtime. Start at page 1.
    get_goods(lv_Id, si, date, SecKey1, 1)
# Taobao weekly ranking endpoint, week 2021-03-29 .. 2021-04-04
f1 = r'https://api.zhigua.cn/v1/rank/GetLiveAnchorMarketRank?pageIndex='
l1 = r'&pageSize=10&dateCode=20210404&period=7&CategoryId=7&subType=3&_=1617935527412'
# NOTE(review): this path looks garbled -- backslashes were probably lost
# when the code was published (e.g. r'C:\Users\...\Desktop\...'); fix before running.
os.chdir(r'C:UsersDesktop数据淘宝店铺直播数据')
data = pd.read_excel('近30天直播信息-筛选.xlsx')
anchors = list(set(data['主播名称'].tolist()))
goods = []  # per-product rows collected by get_goods
# Previously exported rows; the new ones are appended on save.
data1 = pd.read_excel(r'商品基本信息.xlsx')
# Resume logic: anchors fully processed before are skipped; the last
# anchor of the previous export may be half-done, so it stays pending and
# its already-captured streams are removed below.
finished_anchors = list(set(data1["主播名称"].tolist()))
last_anchor = data1.loc[len(data1)-1, "主播名称"]
finished_anchors.remove(last_anchor)
anchoring = []
for anchor in anchors:
    if anchor not in finished_anchors:
        anchoring.append(anchor)
# NOTE(review): the Cookie is a logged-in zhigua.cn session and expires.
headers = {"User-Agent": r"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                         r"Chrome/80.0.3987.87 Safari/537.36",
           "Cookie": "_data_chl=key=baidu-zhiguaci-pinpaici; Qs_lvt_340242=1618124856%2C1618143092%2C1618150086%2C1618188237%2C1618222763; Qs_pv_340242=2771621366381835000%2C3687495452173190000%2C263328052567666940%2C4554603154311060000%2C1365786379582669000; User=UserId=cb40c50b8688818f&Password=f15767c9a0610155552022fd99e22ccd&ChildId=2628&ts=1618309153&acid=-585937397; User_Lower=UserId=cb40c50b8688818f&Password=f15767c9a0610155552022fd99e22ccd&ChildId=2628&ts=1618309153&acid=-585937397; Hm_lvt_bb59088c73528f1463ed89952b5c7e64=1618143079,1618150075,1618188245,1618222771; Hm_lpvt_bb59088c73528f1463ed89952b5c7e64=1618222771"}
# Per-anchor live records, last 60 days (range=3)
f2 = r'https://api.zhigua.cn/v1/Anchor/GetLiveAnchorVideoRecord?pageIndex='
m1 = r'&pageSize=10&liveAnchorId='
m2 = r'&sign='
m3 = r'&seckey='
l2 = r'&isAuction=-1&range=3&_=1618058414489'
# Walk ranking pages 1..10 (10 anchors per page).
for n in range(1, 11):
    sleep_func(10, 150)
    url1 = f1 + str(n) + l1
    r1 = requests.get(url1, headers=headers, timeout=30)
    r1.raise_for_status()
    r1.encoding = 'utf-8'
    rt1 = json.loads(r1.text, strict=False)
    info1 = rt1["Data"]["ItemList"]
    for si in info1:
        # name / RoomNumber / Title / Tags / StartTime are read as globals
        # by get_live / get_goods.
        name = si["Nick"]
        # NOTE(review): the original comment warns about `anchoring`, yet the
        # membership test uses `anchors`, so the resume filter built above is
        # effectively bypassed here -- confirm which list was intended.
        if name in anchors:
            AnchorId = str(si["LiveAnchorId"])
            sign = str(si["IdKey"])
            RoomNumber = si["RoomNumber"]
            print(name)
            SecKey = get_anchor(AnchorId, sign)
            sleep_func(100, 350)
            # Streams of this anchor selected for scraping.
            lives = data.loc[data['主播名称'] == name]
            times = lives['直播开始时间'].tolist()
            if name == last_anchor:
                # Drop the streams already captured in the previous run.
                finished_lives = data1.loc[data1['主播名称'] == name]
                finished_times = list(set(finished_lives['直播开始时间'].tolist()))
                for ft in finished_times:
                    if ft in times:
                        times.remove(ft)
            ti = 1
            # Page through the anchor's 60-day record list; scrape only the
            # streams whose StartTime is in the selection.
            while 1:
                url2 = f2 + str(ti) + m1 + AnchorId + m2 + sign + m3 + SecKey + l2
                r2 = requests.get(url2, headers=headers, timeout=30)
                r2.raise_for_status()
                r2.encoding = 'utf-8'
                rt2 = json.loads(r2.text, strict=False)
                info2 = rt2['Data']["ItemList"]
                for i in info2:
                    StartTime = i['StartTime']
                    if StartTime in times:
                        DateCode = i['DateCode']
                        Tags = i['Tags']
                        Title = i['Title']
                        LiveVideoId = str(i['LiveVideoId'])
                        IdKey = str(i['IdKey'])
                        print(Title)
                        sleep_func(50, 200)
                        get_live(LiveVideoId, IdKey, str(DateCode))
                ti += 1
                ci = rt2["Data"]["TotalCount"]
                countt = ci // 10 + 1  # last page number at 10 rows per page
                if countt < ti:
                    break
# Append the new product rows to the previous export and write back.
df1 = pd.DataFrame(goods)
df1 = pd.concat([data1, df1])
df1.to_excel('商品基本信息.xlsx', index=False)
最后
以上就是忧虑大碗为你收集整理的产品组合创新性对点击量的python爬虫实现1、选取电商零售企业的python代码2、对直播情况进行爬取的python代码3、对直播中产品信息的爬虫代码的全部内容,希望文章能够帮你解决产品组合创新性对点击量的python爬虫实现1、选取电商零售企业的python代码2、对直播情况进行爬取的python代码3、对直播中产品信息的爬虫代码所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复