功能:
爬取指定 B 站 UID 作者的所有视频封面,并保存到本地文件夹中。
代码:
from functools import reduce
import hashlib
import os
import time
import http.cookies
import time
import requests
import urllib.parse
# Running count of downloaded covers; download_img reads it to number the
# saved files and increments it after each download.
x = 0
def parse_cookies(cookie_str):
    """Convert a raw ``Cookie`` header string into a plain dict.

    Args:
        cookie_str: Cookie text such as ``"SESSDATA=abc; bili_jct=def"``.
            ``None`` and the empty string are accepted.

    Returns:
        A ``{name: value}`` dict of the parsed cookies; an empty dict when
        ``cookie_str`` is falsy.
    """
    if not cookie_str:
        return {}
    jar = http.cookies.SimpleCookie(cookie_str)
    return {name: morsel.value for name, morsel in jar.items()}
def getMixinKey(ae):
    """Derive the 32-character mixin key used for WBI request signing.

    The characters of ``ae`` are reordered according to a fixed 64-entry
    index table and the first 32 characters of the result are returned.

    Args:
        ae: The img key concatenated with the sub key. It must be at least
            64 characters long because the table indexes positions 0-63.

    Returns:
        The first 32 characters of the permuted string.

    Raises:
        IndexError: If ``ae`` has fewer than 64 characters.
    """
    # Fixed permutation table: entry k is the index in `ae` of the k-th
    # character of the permuted string.
    order = (
        46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35,
        27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13,
        37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4,
        22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11, 36, 20, 34, 44, 52,
    )
    # The whole table is applied before slicing so that a too-short input
    # still fails loudly instead of yielding a key from a partial permutation.
    return "".join(ae[i] for i in order)[:32]
def getjson(url, headers=None):
    """GET ``url`` and return the decoded JSON body, or ``None`` on failure.

    Args:
        url: Address to request.
        headers: Optional extra request headers, given either as a mapping
            or as an iterable of ``(name, value)`` pairs. They are merged
            over the default ``Referer`` / ``User-Agent`` headers.

    Returns:
        The parsed JSON document when the server answers 200 with a JSON
        body; ``None`` for any other status code or an undecodable body.

    Network-level errors (connection failure, timeout) propagate as
    ``requests`` exceptions.
    """
    cookie = ""  # placeholder: paste a raw Cookie header here if one is needed
    cookies = parse_cookies(cookie)
    request_headers = {
        'Referer': 'https://www.bilibili.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    }
    if headers:
        # dict() accepts both mappings and iterables of pairs, so the
        # caller-supplied headers are honoured instead of being discarded.
        request_headers.update(dict(headers))
    # Same (connect, read) timeout as download_img so a stalled server
    # cannot hang the script forever.
    res = requests.get(url, headers=request_headers, cookies=cookies, timeout=(5, 5))
    if res.status_code != 200:
        return None
    try:
        return res.json()
    except ValueError:
        # 200 with a non-JSON body (e.g. an HTML error page) is a failure too.
        return None
def encWbi(params_in: dict):
    """Compute the WBI signature for a set of request parameters.

    The current img/sub key pair is fetched from the nav endpoint, combined
    into a mixin key via ``getMixinKey``, and appended to the sorted,
    URL-encoded query (including a fresh ``wts`` timestamp) before hashing
    with MD5.

    Args:
        params_in: Request parameters to sign. The dict is not modified.

    Returns:
        A ``(w_rid, wts)`` tuple: the hex MD5 signature and the Unix
        timestamp (seconds) that was signed with it.

    Raises:
        RuntimeError: If the nav endpoint did not return usable WBI keys.
    """
    # NOTE: the original debug prints were removed; they wrote the mixin key
    # and the full nav response (account data when a cookie is set) to stdout.
    resp = getjson("https://api.bilibili.com/x/web-interface/nav")
    try:
        wbi_img = resp["data"]["wbi_img"]
        # Each key is the file name of the image URL without its extension.
        img_key = wbi_img['img_url'].split("/")[-1].split(".")[0]
        sub_key = wbi_img['sub_url'].split("/")[-1].split(".")[0]
    except (TypeError, KeyError, AttributeError) as err:
        raise RuntimeError("could not obtain WBI keys from the nav endpoint") from err
    mixin_key = getMixinKey(img_key + sub_key)

    wts = int(time.time())
    params = dict(params_in)
    params["wts"] = wts
    # Sign what is actually sent: keys in sorted order, values URL-encoded
    # the same way callers encode them. The characters !'()* are dropped from
    # values first, following the community-documented WBI algorithm.
    cleaned = {
        key: "".join(ch for ch in str(value) if ch not in "!'()*")
        for key, value in sorted(params.items())
    }
    query = urllib.parse.urlencode(cleaned)
    w_rid = hashlib.md5((query + mixin_key).encode(encoding='utf-8')).hexdigest()
    return w_rid, wts
# Takes an uploader UID and returns that uploader's videos as a list of dicts.
def getUpVideos(up_uid, startpage=1, endpage=10, tid=0, keyword=''):
    """Collect basic info about the videos an uploader has published.

    Pages ``startpage`` .. ``endpage`` (inclusive) of the space search API
    are requested one by one, pausing 3 seconds before each request.
    Collection stops early at the first page that contains no videos.

    Args:
        up_uid: UID of the uploader.
        startpage: First page to request.
        endpage: Last page to request (inclusive).
        tid: Category id forwarded to the API.
        keyword: Search keyword forwarded to the API.

    Returns:
        A list of dicts with the keys ``title``, ``bvid``, ``author``,
        ``mid`` and ``created``, in the order the API returned them. The
        list is empty when the page range is empty or nothing was found.

    Raises:
        RuntimeError: If a page request fails or its response has no data.
    """
    up_videos = []
    space_video_num = 0  # total reported by the API on the first page
    for page_no in range(startpage, endpage + 1):
        time.sleep(3)  # keep the request rate low
        # NOTE(review): `mid` / `pn` are the parameter names listed for this
        # endpoint in community API docs (the original sent the opus-feed
        # names `host_mid` / `page`) - confirm against the live API.
        query = {
            'mid': up_uid,
            'pn': page_no,
            'tid': tid,
            'keyword': keyword,
            'web_location': 333.999,
        }
        w_rid, wts = encWbi(query)
        encoded = urllib.parse.urlencode(query)
        api_url = 'https://api.bilibili.com/x/space/wbi/arc/search?%s&w_rid=%s&wts=%s' % (encoded, w_rid, wts)
        payload = getjson(api_url)
        data = payload.get('data') if isinstance(payload, dict) else None
        if not data:
            raise RuntimeError(
                'video list request failed for page %s: %r' % (page_no, payload)
            )
        if page_no == startpage:
            # Total number of videos; 0 when the uploader has none.
            space_video_num = data['page']['count']
        page_videos = data['list']['vlist']
        if not page_videos:  # an empty list means there are no more videos
            break
        up_videos.extend(
            {
                'title': info['title'],
                'bvid': info['bvid'],
                'author': info['author'],
                'mid': info['mid'],
                'created': info['created'],
            }
            for info in page_videos
        )
    print('[√] 已获取 [%d/%d] 个视频' % (len(up_videos), space_video_num))
    return up_videos
def download_img(url, name):
    """Download every cover listed in one feed response.

    ``url`` is requested and its JSON body is expected to hold a list under
    ``data.items`` whose entries carry the image address at ``cover.url``.
    Each image is saved as ``<n>.jpg`` in ``D:\\B站封面\\<name>封面合集``,
    where ``<n>`` is the module-level counter ``x``; the counter is
    incremented after every saved file.

    Args:
        url: Signed feed API address to read the cover list from.
        name: Label used in the target folder name (the caller passes the
            uploader UID); converted with ``str`` so ints are accepted.

    Raises:
        requests.HTTPError: If the feed request or an image request returns
            an error status.
    """
    global x
    cookie = ""  # placeholder: paste a raw Cookie header here if one is needed
    cookies = parse_cookies(cookie)
    headers = {
        'Referer': 'https://www.bilibili.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
    }
    res = requests.get(url, headers=headers, cookies=cookies, timeout=(5, 5))
    res.raise_for_status()
    payload = res.json()
    # `data` or `items` can be missing/null on an API-level error.
    data = payload.get('data') if isinstance(payload, dict) else None
    items = (data or {}).get('items') or []

    root_dir = "D:\\B站封面"
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)
        print("新建文件夹B站封面成功")
    album_name = str(name) + "封面合集"
    album_dir = os.path.join(root_dir, album_name)
    if not os.path.exists(album_dir):
        os.makedirs(album_dir)
        print("新建文件夹" + album_name + "成功")

    for pic in items:
        cover_url = (pic.get('cover') or {}).get('url')
        if not cover_url:
            continue  # entry without a cover image
        if cover_url.startswith('//'):
            cover_url = 'https:' + cover_url  # protocol-relative address
        print(cover_url)
        # Fetched through requests (already used above) so the download has
        # a timeout and the same headers, and no un-imported module is needed.
        image = requests.get(cover_url, headers=headers, timeout=(5, 5))
        image.raise_for_status()
        with open(os.path.join(album_dir, '%s.jpg' % x), 'wb') as out_file:
            out_file.write(image.content)
        print("Downloading image No.{}".format(x))
        x += 1
# Takes an uploader UID and downloads the covers listed on each feed page.
def get_images(up_uid, startpage=1, endpage=10):
    """Download covers for pages ``startpage`` .. ``endpage`` of a feed.

    For every page the query is signed with ``encWbi``, the feed URL is
    built, and the URL is handed to ``download_img`` together with the UID.
    Nothing is returned.

    Args:
        up_uid: UID of the uploader.
        startpage: First page to process.
        endpage: Last page to process (inclusive); the default range covers
            ten pages.
    """
    feed_endpoint = 'https://api.bilibili.com/x/polymer/web-dynamic/v1/opus/feed/space'
    for page_no in range(startpage, endpage + 1):
        time.sleep(3)  # keep the request rate low
        query = {
            'host_mid': up_uid,     # uploader UID
            'page': page_no,        # page number
            'web_location': 333.999,
        }
        signature, timestamp = encWbi(query)
        encoded_query = urllib.parse.urlencode(query)
        feed_url = f'{feed_endpoint}?{encoded_query}&w_rid={signature}&wts={timestamp}'
        download_img(feed_url, up_uid)
# Disabled example: signing a single feed request by hand.
#space_video_search_params_dict={'host_mid' : 10835521, # uploader UID
# 'page' : 1, # page number
# 'web_location':333.999,
# }
#w_rid, wts =encWbi(space_video_search_params_dict)
#space_video_search_params_urlcoded = urllib.parse.urlencode(space_video_search_params_dict)
#up_videos_api = 'https://api.bilibili.com/x/polymer/web-dynamic/v1/opus/feed/space?%s&w_rid=%s&wts=%s'%(space_video_search_params_urlcoded,w_rid,wts)
# Script entry point: download the covers for uploader UID 3816626 using the
# default page range of get_images.
get_images('3816626')
# Disabled alternative: print the uploader's video list instead.
#print(getUpVideos(3816626))

牛逼呀博主,这个加密算法我研究了好久。自从接口改版之后,之前写的爬虫就全部失效了,都是因为这个随机参数。我虽然定位到了这个问题,但是一直没研究出来这个参数要如何生成,6666
我也是看了很多别人写的demo和资料之后总结出来的,自己琢磨确实比较难