我是靠谱客的博主 忧虑大碗。最近开发中收集的这篇文章主要介绍产品组合创新性对点击量的python爬虫实现:1、选取电商零售企业的python代码;2、对直播情况进行爬取的python代码;3、对直播中产品信息的爬虫代码。觉得挺不错,现在分享给大家,希望可以做个参考。

概述

1、选取电商零售企业的python代码

import requests
import json
import os
import random
import time
import pandas as pd

def sleep_func(x, y):
    """Pause for a random interval of x/10 .. (y-1)/10 seconds.

    Throttles requests: scraping too fast from a single account makes
    subsequent requests fail for a while.
    """
    tenths = random.randint(x, y - 1)
    time.sleep(tenths * 0.1)

def get_live(lv, si, se):
    """Fetch the anchor's live-stream records for the last 30 days.

    Pages through the GetLiveAnchorVideoRecord endpoint (10 records per
    page) and appends one dict per stream to the module-level
    ``livestreams`` list.  Reads the module-level ``headers``, ``name``
    and ``RoomNumber`` globals.
    """
    base = r'https://api.zhigua.cn/v1/Anchor/GetLiveAnchorVideoRecord?pageIndex='
    page = 1
    while True:
        sleep_func(10, 50)
        # `range` selects the look-back window (2 -> last 30 days)
        url = (base + str(page) + '&pageSize=10&liveAnchorId=' + lv
               + '&sign=' + si + '&seckey=' + se
               + '&isAuction=-1&range=2&_=1617936262247')
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        payload = json.loads(resp.text, strict=False)
        for item in payload['Data']["ItemList"]:
            livestreams.append({
                "主播名称": name, "ID": RoomNumber, "直播标题": item["Title"],
                "直播标签": item["Tags"], "直播开始时间": item["StartTime"],
                "直播结束时间": item["EndTime"], "持续时长(秒)": item["LiveDuration"],
                "场观UV(估)": item["TotalJoinCount"], "PV": item["ViewCount"],
                "直播涨粉": item["FanGrowth"], "商品数": item["GoodsCount"],
                "客单价(估)": item["SalePrice"], "带货销量(估)": item["SaleCount"],
                "带货GMV(估)": item["SaleAmount"],
            })
        page += 1
        total = payload["Data"]["TotalCount"]
        if total // 10 + 1 < page:
            break

def get_store(lv, si, se):
    """Fetch the stores associated with the anchor over the last 30 days.

    Pages through the SellerRank endpoint (10 stores per page) and appends
    one dict per store to the module-level ``stores`` list.  Reads the
    module-level ``headers``, ``name`` and ``RoomNumber`` globals.
    """
    base = r'https://api.zhigua.cn/v1/AnchorSellerAnalysis/SellerRank?pageSize=10&pageIndex='
    page = 1
    while True:
        sleep_func(10, 50)
        # range=2 -> last 30 days
        url = (base + str(page) + '&liveAnchorId=' + lv + '&sign=' + si
               + '&seckey=' + se + '&range=2&_=1617938250625')
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        payload = json.loads(resp.text, strict=False)
        for item in payload['Data']["ItemList"]:
            stores.append({
                "主播名称": name, "ID": RoomNumber, '店铺名称': item["ShopName"],
                "店铺ID": item["SellerId"], "店铺等级": item["CreditLevel"],
                "合作商品(个数)": item["TBItemIdCount"],
                "店铺带货客单价": item["TransAmount"],
                "店铺带货销量": item["SaleCount"], "店铺带货GMV": item["SaleAmount"],
                "涉及直播次数": item["LiveCount"], "曝光度": item["Rate"],
            })
        page += 1
        total = payload["Data"]["TotalCount"]
        if total // 10 + 1 < page:
            break

def get_anchor(lv, si):
    """Fetch one anchor's profile, record it, then crawl lives and stores.

    Appends the profile to the module-level ``anchors`` list, extracts the
    SecKey from the response, and hands it to get_live() and get_store().
    Reads the module-level ``headers``, ``name`` and ``RoomNumber`` globals.
    """
    url = (r'https://api.zhigua.cn/v1/anchor/GetAnchorByID?liveAnchorId='
           + lv + '&sign=' + si + '&_=1614490065737')
    sleep_func(10, 50)
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    profile = json.loads(resp.text, strict=False)['Data']
    anchors.append({
        "主播名称": name, "ID": RoomNumber, "地区": profile["Location"],
        "主播类型": profile["AnchorTypeName"], "所属行业": profile["CategoryName"],
        "擅长领域": profile["TBBroadCasterField"], "粉丝数": profile["FansCount"],
        "最爱TA": profile["TBFollowTopCount"], "开播场次": profile["LiveCountIncluded"],
        "场均点赞": profile["AvgLike"], "场均评论": profile["AvgComment"],
    })
    sec_key = str(profile["SecKey"])

    get_live(lv, si, sec_key)
    get_store(lv, si, sec_key)

# ---- module-level collectors, filled by the functions above ----
anchors = []  # anchor profile records
ranks = []  # weekly ranking entries
livestreams = []  # live-stream records for the last 30 days
stores = []  # associated-store records

# NOTE(review): hard-coded session cookie with embedded credentials —
# requests will start failing once it expires; rotate before running.
headers = {"User-Agent": r"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                         r"Chrome/80.0.3987.87 Safari/537.36",
           "Cookie": "_data_chl=key=baidu-zhiguaci-pinpaici; Hm_lvt_bb59088c73528f1463ed89952b5c7e64=1617772534,1617877140,1617933065,1617952954; Hm_lpvt_bb59088c73528f1463ed89952b5c7e64=1617952954; Qs_lvt_340242=1617622644%2C1617772534%2C1617877140%2C1617933065%2C1617952954; Qs_pv_340242=2662092684555760000%2C3352647256333484500%2C4609210324117199000%2C4313385324937449000%2C940474279766886100; User=UserId=cb40c50b8688818f&Password=7b0de2266359c1123669e550c7b39ab9&ChildId=3420&ts=1618039548&acid=-1238459688; User_Lower=UserId=cb40c50b8688818f&Password=7b0de2266359c1123669e550c7b39ab9&ChildId=3420&ts=1618039548&acid=-1238459688"}

# Taobao weekly ranking endpoint, week of 2021-03-29 .. 2021-04-04
f1 = r'https://api.zhigua.cn/v1/rank/GetLiveAnchorMarketRank?pageIndex='
l1 = r'&pageSize=10&dateCode=20210404&period=7&CategoryId=7&subType=3&_=1617935527412'

# NOTE(review): backslashes appear to have been stripped from this Windows
# path (copy/paste artifact) — restore the separators before running.
os.chdir(r'C:UsersDesktop数据淘宝店铺店铺基本信息')

# Walk ranking pages 1-8: record each ranked anchor's row, then crawl the
# anchor's profile, lives and stores via get_anchor().
for i in range(1, 9):
    sleep_func(10, 50)
    # longer back-off every 20th page (unreachable here since i stops at 8)
    if i % 20 == 0:
        sleep_func(150, 300)
    url1 = f1 + str(i) + l1
    r1 = requests.get(url1, headers=headers, timeout=30)
    r1.raise_for_status()
    r1.encoding = 'utf-8'
    rt1 = json.loads(r1.text, strict=False)
    info = rt1["Data"]["ItemList"]
    for n in info:
        AnchorId = str(n["LiveAnchorId"])
        sign = str(n["IdKey"])
        # `name` and `RoomNumber` are read as globals by
        # get_anchor / get_live / get_store
        name = n["Nick"]
        RoomNumber = n["RoomNumber"]
        rank = {"主播名称": name, "ID": RoomNumber, "知瓜指数": n["Score"], "粉丝数": n["FansCount"],
               "点赞数": n["LikeCount"], "观看次数": n["ViewCount"], "直播时长": n["LiveDuration"]}
        ranks.append(rank)
        get_anchor(AnchorId, sign)
        print(name)
    print(i)

# NOTE(review): these workbooks are read, but the collected lists above are
# never written out anywhere in this script — presumably saved in a later
# step that is not shown here; verify before relying on this script alone.
data1 = pd.read_excel('主播基本信息.xlsx')
data2 = pd.read_excel('店铺排行信息.xlsx')
data3 = pd.read_excel('近30天直播信息.xlsx')
data4 = pd.read_excel('近30天关联店铺信息.xlsx')

2、对直播情况进行爬取的python代码

import requests
import json
import os
import random
import time
import pandas as pd

def sleep_func(x, y):
    """Pause for a random interval of x/10 .. (y-1)/10 seconds.

    Throttles requests: scraping too fast from a single account makes
    subsequent requests fail for a while.
    """
    tenths = random.randint(x, y - 1)
    time.sleep(tenths * 0.1)

def get_anchor(lv, si):
    """Look up an anchor's profile and return its SecKey as a string.

    The SecKey is required by the other live-detail endpoints.  Reads the
    module-level ``headers`` global.
    """
    url = (r'https://api.zhigua.cn/v1/anchor/GetAnchorByID?liveAnchorId='
           + lv + '&sign=' + si + '&_=1614490065737')
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding
    payload = json.loads(resp.text, strict=False)
    return str(payload['Data']["SecKey"])

def get_brand_category(br, idk, dc, sk, lva):
    """Fetch the price-band and category distribution of goods in one live.

    Builds one row keyed by the live's identity (globals name / RoomNumber /
    Title / Tags / StartTime) plus one column per price band and one per
    category, and appends it to the module-level ``categorys`` list.
    """
    url = (r'https://api.zhigua.cn/v1/Live/GetGoodsDistributedInfo?liveVideoId='
           + br + '&sign=' + idk + '&dateCode=' + dc + '&seckey=' + sk
           + '&PreStatus=0&liveAnchorId=' + lva + '&_=1618061169415')
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    payload = json.loads(resp.text, strict=False)
    row = {"主播名称": name, "ID": RoomNumber, "直播标题": Title,
           "直播标签": Tags, "直播开始时间": StartTime}
    for band in payload['Data']["Price"]:
        row["价格区间" + str(band["Title"])] = band["Value"]
    for cat in payload['Data']["Category"]:
        row[cat["Title"]] = cat["Value"]
    categorys.append(row)

def get_clock(br, idk, dc, sk):
    """Fetch per-time-slot go-to-buy statistics for one live.

    Builds two rows — GoBuyCount and GoBuyAmount per time slot — and
    appends them to the module-level ``clock_infos`` / ``clock_amounts``
    lists.  Reads the globals name / RoomNumber / Title / Tags / StartTime.
    """
    url = (r'https://api.zhigua.cn/v1/Live/GetLiveVideoStatByTimeLines?liveVideoId='
           + br + '&sign=' + idk + '&dateCode=' + dc + '&seckey=' + sk
           + '&dataType=6&_=1618058876809')
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    slots = json.loads(resp.text, strict=False)['Data']
    base_row = {"主播名称": name, "ID": RoomNumber, "直播标题": Title,
                "直播标签": Tags, "直播开始时间": StartTime,
                "开始统计时段": slots[0]["Time"]}
    counts_row = dict(base_row)
    amounts_row = dict(base_row)
    for slot in slots:
        moment = str(slot["MomentTag"])
        counts_row["时间段" + moment] = slot["Value"]["GoBuyCount"]
        amounts_row["时间段" + moment] = slot["Value"]["GoBuyAmount"]
    clock_infos.append(counts_row)
    clock_amounts.append(amounts_row)

# 总函数
# Driver for a single live: overview row + time-slot stats + goods distribution.
def get_live(lv_Id, si, date):
    """Fetch one live's overview, record it, then crawl its time-slot and
    goods-distribution data using the SecKey from the response.

    Appends to the module-level ``livestreams`` list; reads the globals
    name / RoomNumber / Title / Tags / headers.
    """
    url = (r'https://api.zhigua.cn/v1/Live/GetLive?liveVideoId=' + lv_Id
           + '&sign=' + si + '&dateCode=' + str(date) + '&_=1614079673832')
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = 'utf-8'
    detail = json.loads(resp.text, strict=False)['Data']
    start_time = detail['StartTime']

    livestreams.append({
        "主播名称": name, "ID": RoomNumber, "直播开始时间": start_time,
        "直播标题": Title, "直播标签": Tags,
        "直播结束时间": detail['EndTime'], "持续时长(秒)": detail['LiveDuration'],
        "观看UV(估)": detail["TotalJoinCount"], "观看PV": detail["ViewCount"],
        "点赞数": detail["LikeCount"], "评论弹幕数": detail["TextMsgCount"],
        "进店人数": detail["GoBuyCount"], "增粉数": detail["FanGrowth"],
        "客单价(估)": detail["UnitPrice"],
        '实际成交额(AI)': detail["DiscountSaleAmount"],
        "平均坑产": detail["AvgPitPrice"], "用户分享": detail["LiveShareCount"],
    })

    sec_key = str(detail['SecKey'])
    sleep_func(10, 50)
    get_clock(lv_Id, si, date, sec_key)
    sleep_func(10, 50)
    anchor_id = str(detail["LiveAnchorId"])
    sleep_func(10, 50)
    get_brand_category(lv_Id, si, date, sec_key, anchor_id)

# Taobao weekly ranking endpoint, week of 2021-03-29 .. 2021-04-04
f1 = r'https://api.zhigua.cn/v1/rank/GetLiveAnchorMarketRank?pageIndex='
l1 = r'&pageSize=10&dateCode=20210404&period=7&CategoryId=7&subType=3&_=1617935527412'

# NOTE(review): backslashes appear to have been stripped from this Windows
# path (copy/paste artifact) — restore the separators before running.
os.chdir(r'C:UsersDesktop数据淘宝店铺直播数据')

data = pd.read_excel('近30天直播信息-筛选.xlsx')
anchors = list(set(data['主播名称'].tolist()))  # distinct anchor names to crawl

# ---- collectors, filled by the functions above ----
livestreams = []  # live overview rows
clock_infos = []  # per-slot GoBuyCount (shoppers entering the store)
clock_amounts = []  # per-slot GoBuyAmount (NOTE: original comment said "count"; values are amounts)
categorys = []  # price-band / category distribution rows

# previously collected data; this run appends to it at the end
data2 = pd.read_excel(r'直播信息概览.xlsx')
data3 = pd.read_excel(r'进店人数.xlsx')
data4 = pd.read_excel(r'进店次数.xlsx')
data5 = pd.read_excel(r'商品分类分布.xlsx')

# Resume support: anchors fully crawled in the saved data are skipped; the
# last anchor may be partial, so it stays eligible for re-crawling.
finished_anchors = list(set(data2["主播名称"].tolist()))
last_anchor = data2.loc[len(data2)-1, "主播名称"]
finished_anchors.remove(last_anchor)

anchoring = []
for anchor in anchors:
    if anchor not in finished_anchors:
        anchoring.append(anchor)

# NOTE(review): hard-coded session cookie with embedded credentials —
# requests will start failing once it expires; rotate before running.
headers = {"User-Agent": r"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                         r"Chrome/80.0.3987.87 Safari/537.36",
           "Cookie": "_data_chl=key=baidu-zhiguaci-pinpaici; Qs_lvt_340242=1618104837%2C1618124856%2C1618143092%2C1618150086%2C1618188237; Qs_pv_340242=2600672622879513600%2C2771621366381835000%2C3687495452173190000%2C263328052567666940%2C4554603154311060000; Hm_lvt_bb59088c73528f1463ed89952b5c7e64=1618124855,1618143079,1618150075,1618188245; Hm_lpvt_bb59088c73528f1463ed89952b5c7e64=1618188245; User=UserId=cb40c50b8688818f&Password=f15767c9a0610155552022fd99e22ccd&ChildId=2628&ts=1618274717&acid=-360711811; User_Lower=UserId=cb40c50b8688818f&Password=f15767c9a0610155552022fd99e22ccd&ChildId=2628&ts=1618274717&acid=-360711811"}

# anchor live-record endpoint, last 60 days (range=3)
f2 = r'https://api.zhigua.cn/v1/Anchor/GetLiveAnchorVideoRecord?pageIndex='
m1 = r'&pageSize=10&liveAnchorId='
m2 = r'&sign='
m3 = r'&seckey='
l2 = r'&isAuction=-1&range=3&_=1618058414489'

# Walk ranking pages 1-10; for every anchor still to do, crawl the detail of
# each live listed in the filtered input workbook.
for n in range(1, 11):
    sleep_func(10, 150)
    url1 = f1 + str(n) + l1

    r1 = requests.get(url1, headers=headers, timeout=30)
    r1.raise_for_status()
    r1.encoding = 'utf-8'
    rt1 = json.loads(r1.text, strict=False)
    info1 = rt1["Data"]["ItemList"]
    for si in info1:
        name = si["Nick"]
        # only anchors not yet finished (see `anchoring` above)
        if name in anchoring:
            AnchorId = str(si["LiveAnchorId"])
            sign = str(si["IdKey"])
            # `name`/`RoomNumber` (and later Title/Tags/StartTime) are read
            # as globals by get_live/get_clock/get_brand_category
            RoomNumber = si["RoomNumber"]
            print(name)
            SecKey = get_anchor(AnchorId, sign)
            sleep_func(100, 350)

            # the start times we still need for this anchor
            lives = data.loc[data['主播名称'] == name]
            times = lives['直播开始时间'].tolist()
            # for the partially crawled last anchor, drop lives already saved
            if name == last_anchor:
                finished_lives = data2.loc[data2['主播名称'] == name]
                finished_times = finished_lives['直播开始时间'].tolist()
                for ft in finished_times:
                    if ft in times:
                        times.remove(ft)

            # page through the anchor's 60-day live records
            ti = 1
            while 1:
                url2 = f2 + str(ti) + m1 + AnchorId + m2 + sign + m3 + SecKey +l2
                r2 = requests.get(url2, headers=headers, timeout=30)
                r2.raise_for_status()
                r2.encoding = 'utf-8'
                rt2 = json.loads(r2.text, strict=False)
                info2 = rt2['Data']["ItemList"]

                for i in info2:
                    StartTime = i['StartTime']
                    if StartTime in times:
                        DateCode = i['DateCode']
                        Tags = i['Tags']
                        Title = i['Title']
                        LiveVideoId = str(i['LiveVideoId'])
                        IdKey = str(i['IdKey'])
                        print(Title)
                        sleep_func(50, 200)
                        get_live(LiveVideoId, IdKey, str(DateCode))

                ti += 1
                ci = rt2["Data"]["TotalCount"]
                countt = ci // 10 + 1
                if countt < ti:
                    break

# append this run's rows to the previously saved workbooks
df2 = pd.DataFrame(livestreams)
df2 = pd.concat([data2, df2])
df2.to_excel('直播信息概览.xlsx', index=False)

df3 = pd.DataFrame(clock_infos)
df3 = pd.concat([data3, df3])
df3.to_excel('进店人数.xlsx', index=False)

df4 = pd.DataFrame(clock_amounts)
df4 = pd.concat([data4, df4])
df4.to_excel('进店次数.xlsx', index=False)

df5 = pd.DataFrame(categorys)
df5 = pd.concat([data5, df5])
df5.to_excel('商品分类分布.xlsx', index=False)

3、对直播中产品信息的爬虫代码

import requests
import json
import os
import random
import time
import pandas as pd

def sleep_func(x, y):
    """Pause for a random interval of x/10 .. (y-1)/10 seconds.

    Throttles requests: scraping too fast from a single account makes
    subsequent requests fail for a while.
    """
    tenths = random.randint(x, y - 1)
    time.sleep(tenths * 0.1)

def get_anchor(lv, si):
    """Look up an anchor's profile and return its SecKey as a string.

    The SecKey is required by the other live-detail endpoints.  Reads the
    module-level ``headers`` global.
    """
    url = (r'https://api.zhigua.cn/v1/anchor/GetAnchorByID?liveAnchorId='
           + lv + '&sign=' + si + '&_=1614490065737')
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding
    payload = json.loads(resp.text, strict=False)
    return str(payload['Data']["SecKey"])

def tim(bst, st1):
    """Classify this live's start time against an earlier related live.

    Args:
        bst: start time of the live being analysed, "%Y-%m-%d %H:%M:%S".
        st1: start time of the earliest related live, same format.

    Returns:
        -1  data problem (the related live starts after this one)
         1  identical timestamps -> this is the product's first appearance
         0  related live within the previous 30 days -> definitely not new
         2  related live more than 30 days earlier -> not new overall, but
            the per-month status still needs checking
    """
    this_ts = time.mktime(time.strptime(bst, "%Y-%m-%d %H:%M:%S"))
    earliest_ts = time.mktime(time.strptime(st1, "%Y-%m-%d %H:%M:%S"))
    gap = this_ts - earliest_ts
    if gap < 0:
        return -1
    if gap == 0:
        return 1
    if gap <= 2592000:  # 30 days in seconds
        return 0
    return 2

def get_new(br, idk, dc, sk, tbi):
    """Classify whether product ``tbi`` is new, using its related-lives history.

    Checks the earliest related live in the 30-day window first; if this
    live turns out to be the earliest there, widens to the 60-day window and
    pages through it until tim() gives a decisive (<= 1) answer.

    Returns tim()'s code: -1 data problem, 0 not new, 1 new.

    NOTE(review): if the 30-day check returns 2 the function falls through
    and implicitly returns None — looks like an unhandled branch; confirm.
    NOTE(review): ``new_goodf`` can be referenced before assignment if a
    related-lives page comes back empty.
    NOTE(review): this function is never called in this script —
    get_goods() inlines similar logic; possibly dead code.
    Reads the globals ``StartTime`` and ``headers``.
    """
    first = r'https://api.zhigua.cn/v1/Live/GoodsRelatedLives?pageIndex='
    middl1 = r'&pageSize=10&liveVideoId='
    middl2 = r'&sign='
    middl3 = r'&dateCode='
    middl4 = r'&seckey='
    middl5 = r'&PreStatus=0&TBItemId='
    la1 = r'&sort=4&range=2&_=1618196432155'  # last 30 days
    la2 = r'&sort=4&range=3&_=1618196494073'  # last 60 days
    sleep_func(10, 50)
    urlf1 = first + str(1) + middl1 + br + middl2 + idk + middl3 + dc + middl4 + sk + middl5 + tbi + la1  # 30-day window
    rf1 = requests.get(urlf1, headers=headers, timeout=30)
    rf1.raise_for_status()
    rf1.encoding = 'utf-8'
    rft1 = json.loads(rf1.text, strict=False)
    infof1 = rft1['Data']["ItemList"]
    stf1 = infof1[0]["StartTime"]  # earliest start among related lives
    new_good1 = tim(StartTime, stf1)
    if new_good1 <= 0:
        return new_good1
    elif new_good1 == 1:
        # this live is the earliest in the 30-day data; widen to 60 days
        sleep_func(10, 50)
        urlf2 = first + str(1) + middl1 + br + middl2 + idk + middl3 + dc + middl4 + sk + middl5 + tbi + la2  # 60-day window
        rf2 = requests.get(urlf2, headers=headers, timeout=30)
        rf2.raise_for_status()
        rf2.encoding = 'utf-8'
        rft2 = json.loads(rf2.text, strict=False)
        infof2 = rft2['Data']["ItemList"]
        stf2 = infof2[0]["StartTime"]  # earliest start among related lives
        new_good2 = tim(StartTime, stf2)
        if new_good2 <= 1:
            return new_good2
        else:
            # scan the first page for a decisive (<= 1) result
            for f2 in infof2:
                stf = f2["StartTime"]
                new_goodf = tim(StartTime, stf)
                if new_goodf <= 1:
                    break
            if new_goodf <= 1:
                return new_goodf
            else:
                # page through the remaining related lives until decisive
                cf = rft2["Data"]["TotalCount"]
                nfi = 2
                while 1:
                    sleep_func(10, 50)
                    urlf2 = first + str(nfi) + middl1 + br + middl2 + idk + middl3 + dc + middl4 + sk + middl5 + tbi + la2  # 60-day window
                    rf2 = requests.get(urlf2, headers=headers, timeout=30)
                    rf2.raise_for_status()
                    rf2.encoding = 'utf-8'
                    rft2 = json.loads(rf2.text, strict=False)
                    infof2 = rft2['Data']["ItemList"]
                    for f2 in infof2:
                        stf = f2["StartTime"]
                        new_goodf = tim(StartTime, stf)
                        if new_goodf <= 1:
                            break
                    if new_goodf <= 1:
                        break
                    nfi += 1
                    countf = cf // 10 + 1
                    if countf < nfi:
                        break
                return new_goodf

def get_goods(br, idk, dc, sk, ni=1):
    """Crawl every product shown in one live and classify its novelty.

    Pages through the live's goods list (10 per page), records each
    product's basic stats, then pages through the product's related lives
    (60-day window) until tim() gives a decisive answer on whether the
    product is new overall and new within the last 30 days.

    Args:
        br:  liveVideoId of the live being crawled.
        idk: sign/IdKey for the live.
        dc:  dateCode of the live.
        sk:  SecKey obtained from the live detail endpoint.
        ni:  page index to start from.  Bug fix: the original declared five
             parameters but its only caller (get_live) passed four, raising
             TypeError; ni now defaults to 1.

    Side effects: appends one dict per product to the module-level ``goods``
    list; reads the globals name/RoomNumber/Title/Tags/StartTime/headers.
    """
    # goods-in-live endpoint pieces
    before = r'https://api.zhigua.cn/v1/Live/GetGoodsInLiveVideo?pageIndex='
    middle1 = r'&pageSize=10&liveVideoId='
    middle2 = r'&sign='
    middle3 = r'&dateCode='
    middle4 = r'&seckey='
    last = r'&sort=10&_=1618061169684'

    # related-lives endpoint pieces (range=3 -> last 60 days)
    bef = r'https://api.zhigua.cn/v1/Live/GoodsRelatedLives?pageIndex='
    las1 = r'&PreStatus=0&TBItemId='
    las2 = r'&sort=4&range=3&_=1618196494073'
    while 1:
        sleep_func(150, 300)
        if ni % 8 == 0:
            sleep_func(350, 600)
        url = before + str(ni) + middle1 + br + middle2 + idk + middle3 + dc + middle4 + sk + last
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        rt = json.loads(r.text, strict=False)
        info = rt['Data']["ItemList"]
        for xt in info:
            goodInform = {"主播名称": name, "ID": RoomNumber, "直播标题": Title, "直播标签": Tags, "直播开始时间": StartTime,
                          "商品序号": xt["GoodsIndex"], "商品名称": xt["Title"], "品牌": xt["BrandName"], "价格": xt["Price"],
                          "上架时间": xt["StartTime"], "讲解开始": xt["ExplainTime"], "讲解结束": xt["ExplainEndTime"],
                          "营销词": xt["ItemRights"], "直播间同播": xt["SameTimeLiveCount"],
                          "直播销量": xt["SaleCount"], "关联播主": xt["AnchorCount"], "关联直播": xt["LiveCount"]}
            TBItemId = str(xt["TBItemId"])
            sleep_func(10, 50)
            # first page of this product's related lives
            urlx = bef + str(1) + middle1 + br + middle2 + idk + middle3 + dc + middle4 + sk + las1 + TBItemId + las2
            rx = requests.get(urlx, headers=headers, timeout=30)
            rx.raise_for_status()
            rx.encoding = 'utf-8'
            rxt = json.loads(rx.text, strict=False)
            infox = rxt['Data']["ItemList"]
            st = infox[0]["StartTime"]  # earliest start among related lives
            new_good = tim(StartTime, st)
            goodInform['是否新品'] = new_good
            month_new = new_good
            if month_new == 2:
                # scan the first page for a decisive (<= 1) classification
                for f1 in infox:
                    stf = f1["StartTime"]
                    new_goodf = tim(StartTime, stf)
                    if new_goodf <= 1:
                        month_new = new_goodf
                        break
            if month_new > 1:
                # still undecided: page through the remaining related lives
                cf = rxt["Data"]["TotalCount"]
                nfi = 2
                while 1:
                    sleep_func(10, 50)
                    if nfi % 8 == 0:
                        sleep_func(100, 250)
                    # Bug fix: the original wrote `nfi & 20 == 0` (bitwise
                    # AND), which fired for many unintended pages (2, 3, 8,
                    # 9, ...) and caused excessive back-off; `%` matches the
                    # every-Nth-page pattern used everywhere else.
                    if nfi % 20 == 0:
                        sleep_func(300, 550)
                    urlf2 = bef + str(nfi) + middle1 + br + middle2 + idk + middle3 + dc + middle4 + sk + las1 + TBItemId + las2  # 60-day window
                    rf2 = requests.get(urlf2, headers=headers, timeout=30)
                    rf2.raise_for_status()
                    rf2.encoding = 'utf-8'
                    rft2 = json.loads(rf2.text, strict=False)
                    infof2 = rft2['Data']["ItemList"]
                    for f2 in infof2:
                        stf = f2["StartTime"]
                        new_goodf = tim(StartTime, stf)
                        if new_goodf == 2:
                            # NOTE(review): marks the product "new" when a
                            # gap > 30 days is seen — looks inverted; kept
                            # as in the original pending confirmation.
                            month_new = 1
                        if new_goodf <= 1:
                            month_new = new_goodf
                            break
                    if new_goodf <= 1:
                        break
                    nfi += 1
                    countf = cf // 10 + 1
                    if countf < nfi:
                        break

            goodInform['近30天是否新品'] = month_new
            goods.append(goodInform)
        print(ni)
        ni += 1
        c = rt["Data"]["TotalCount"]
        count = c // 10 + 1
        if count < ni:
            break

# 总函数
def get_live(lv_Id, si, date):
    befo = r'https://api.zhigua.cn/v1/Live/GetLive?liveVideoId='
    midd1 = r'&sign='
    midd2 = r'&dateCode='
    la = r'&_=1614079673832'
    urlt = befo + lv_Id + midd1 + si + midd2 + str(date) + la
    ra = requests.get(urlt, headers=headers, timeout=30)
    ra.raise_for_status()
    ra.encoding = 'utf-8'
    rat = json.loads(ra.text, strict=False)
    inf = rat['Data']
    
    SecKey1 = str(inf['SecKey']) 
    sleep_func(10, 50)
    get_goods(lv_Id, si, date, SecKey1)

# Taobao weekly ranking endpoint, week of 2021-03-29 .. 2021-04-04
f1 = r'https://api.zhigua.cn/v1/rank/GetLiveAnchorMarketRank?pageIndex='
l1 = r'&pageSize=10&dateCode=20210404&period=7&CategoryId=7&subType=3&_=1617935527412'

# NOTE(review): backslashes appear to have been stripped from this Windows
# path (copy/paste artifact) — restore the separators before running.
os.chdir(r'C:UsersDesktop数据淘宝店铺直播数据')

data = pd.read_excel('近30天直播信息-筛选.xlsx')
anchors = list(set(data['主播名称'].tolist()))  # distinct anchor names to crawl

goods = []   # per-product rows collected by get_goods()

# previously collected data; this run appends to it at the end
data1 = pd.read_excel(r'商品基本信息.xlsx')

# Resume support: anchors fully crawled in the saved data are skipped; the
# last anchor may be partial, so it stays eligible for re-crawling.
finished_anchors = list(set(data1["主播名称"].tolist()))
last_anchor = data1.loc[len(data1)-1, "主播名称"]
finished_anchors.remove(last_anchor)

anchoring = []
for anchor in anchors:
    if anchor not in finished_anchors:
        anchoring.append(anchor)

# NOTE(review): hard-coded session cookie with embedded credentials —
# requests will start failing once it expires; rotate before running.
headers = {"User-Agent": r"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                         r"Chrome/80.0.3987.87 Safari/537.36",
           "Cookie": "_data_chl=key=baidu-zhiguaci-pinpaici; Qs_lvt_340242=1618124856%2C1618143092%2C1618150086%2C1618188237%2C1618222763; Qs_pv_340242=2771621366381835000%2C3687495452173190000%2C263328052567666940%2C4554603154311060000%2C1365786379582669000; User=UserId=cb40c50b8688818f&Password=f15767c9a0610155552022fd99e22ccd&ChildId=2628&ts=1618309153&acid=-585937397; User_Lower=UserId=cb40c50b8688818f&Password=f15767c9a0610155552022fd99e22ccd&ChildId=2628&ts=1618309153&acid=-585937397; Hm_lvt_bb59088c73528f1463ed89952b5c7e64=1618143079,1618150075,1618188245,1618222771; Hm_lpvt_bb59088c73528f1463ed89952b5c7e64=1618222771"}

# anchor live-record endpoint, last 60 days (range=3)
f2 = r'https://api.zhigua.cn/v1/Anchor/GetLiveAnchorVideoRecord?pageIndex='
m1 = r'&pageSize=10&liveAnchorId='
m2 = r'&sign='
m3 = r'&seckey='
l2 = r'&isAuction=-1&range=3&_=1618058414489'

# Walk ranking pages 1-10; for each selected anchor, crawl the goods of
# every live listed in the filtered input workbook.
for n in range(1, 11):
    sleep_func(10, 150)
    url1 = f1 + str(n) + l1

    r1 = requests.get(url1, headers=headers, timeout=30)
    r1.raise_for_status()
    r1.encoding = 'utf-8'
    rt1 = json.loads(r1.text, strict=False)
    info1 = rt1["Data"]["ItemList"]
    for si in info1:
        name = si["Nick"]
        # NOTE(review): this tests `anchors`, not the resume-filtered
        # `anchoring` list built above, so already-finished anchors are
        # re-crawled — presumably it should be `anchoring` (confirm;
        # script 2 of this series uses `anchoring` here).
        if name in anchors:
            AnchorId = str(si["LiveAnchorId"])
            sign = str(si["IdKey"])
            # `name`/`RoomNumber` (and later Title/Tags/StartTime) are read
            # as globals by get_goods()
            RoomNumber = si["RoomNumber"]
            print(name)
            SecKey = get_anchor(AnchorId, sign)
            sleep_func(100, 350)

            # the start times we still need for this anchor
            lives = data.loc[data['主播名称'] == name]
            times = lives['直播开始时间'].tolist()

            # for the partially crawled last anchor, drop lives already saved
            if name == last_anchor:
                finished_lives = data1.loc[data1['主播名称'] == name]
                finished_times = list(set(finished_lives['直播开始时间'].tolist()))
                for ft in finished_times:
                    if ft in times:
                        times.remove(ft)

            # page through the anchor's 60-day live records
            ti = 1
            while 1:
                url2 = f2 + str(ti) + m1 + AnchorId + m2 + sign + m3 + SecKey +l2
                r2 = requests.get(url2, headers=headers, timeout=30)
                r2.raise_for_status()
                r2.encoding = 'utf-8'
                rt2 = json.loads(r2.text, strict=False)
                info2 = rt2['Data']["ItemList"]

                for i in info2:
                    StartTime = i['StartTime']
                    if StartTime in times:
                        DateCode = i['DateCode']
                        Tags = i['Tags']
                        Title = i['Title']
                        LiveVideoId = str(i['LiveVideoId'])
                        IdKey = str(i['IdKey'])
                        print(Title)
                        sleep_func(50, 200)
                        get_live(LiveVideoId, IdKey, str(DateCode))

                ti += 1
                ci = rt2["Data"]["TotalCount"]
                countt = ci // 10 + 1
                if countt < ti:
                    break

# append this run's rows to the previously saved workbook
df1 = pd.DataFrame(goods)
df1 = pd.concat([data1, df1])
df1.to_excel('商品基本信息.xlsx', index=False)

最后

以上就是忧虑大碗为你收集整理的产品组合创新性对点击量的python爬虫实现1、选取电商零售企业的python代码2、对直播情况进行爬取的python代码3、对直播中产品信息的爬虫代码的全部内容,希望文章能够帮你解决产品组合创新性对点击量的python爬虫实现1、选取电商零售企业的python代码2、对直播情况进行爬取的python代码3、对直播中产品信息的爬虫代码所遇到的程序开发问题。

如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。

本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
点赞(34)

评论列表共有 0 条评论

立即
投稿
返回
顶部