# 需求背景:我们有自己的平台,但我们希望在我们的平台上实现百度搜索界面,输入要搜索的内容,模拟百度搜索,并在我们自己的平台上显示返回的内容,供用户查看。
# -- coding:utf8 --
import hashlib
import random
import sys
import time
from pprint import pprint
from urllib.parse import quote

from bs4 import BeautifulSoup
from requests_html import HTMLSession


class GetProxy:
    """Hand out proxy addresses picked at random from a small in-memory pool."""

    # Pool of "host:port" strings; filled on first use and stored on the
    # class, so it is shared by every caller.
    _instance = None

    @staticmethod
    def get_random_proxy():
        """Return one "host:port" entry chosen at random from the pool."""
        if not GetProxy._instance:
            GetProxy._instance = ['211.149.199.235:16818', '123.57.57.125:16818']
        return random.choice(GetProxy._instance).strip()

    def get_ip_http(self):
        """Return a proxies mapping whose only key is "http"."""
        return {"http": self.get_random_proxy()}

    def get_ip_https(self):
        """Return a proxies mapping whose only key is "https"."""
        return {"https": self.get_random_proxy()}


def get_random_cookie():
    """Return one of the hard-coded cookie header strings, chosen at random.

    NOTE(review): these look like captured browser session cookies committed
    to source; consider loading them from configuration instead.
    """
    cookies = [
        'PSTM=1645440125; BAIDUID=49D5966BB6F2D98A8378EC10151CE748:FG=1; BAIDUID_BFESS=49D5966BB6F2D98A8378EC10151CE748:FG=1; BIDUPSID=5C48EADF0E27C74CB11F290539E5EAA8; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; __yjs_duid=1_6b058121c11c500f39afbc042ec623711645440178604; delPer=0; PSINO=7; MCITY=-257:; BA_HECTOR=05a0ak0ga42525a5us1h18lb30r; BDRCVFR[C0p6oIjvx-c]=rJZwba6_rOCfAF9pywd; H_PS_PSSID=35105_35865_34584_3541_35872_35246_35319; ab_sr=1.0.1_ZGM2MTQ3YjE2NGE0ZmE2NWNhNGYzMDQ1Nzg1ZWYxYWFjZDllZjA1NzY0YWE3NjVjZmEyNjA4NmE5NTljZTEzOTFkNzViMWRlNTA4ZmQwYWIzYWZlYjQyMDYxZTcxNGI0NWVjYzU5ODk0ZDVmYmNkZDI4YzkyNGEwNTUwZjc4MWU3Y2Q0ZTUzOGExNjQwZTgzMzM4ZjQ2ZjkzMjE0OGNjZA==; BAIDU_WISE_UID=wapp_1645499858512_985',
        'BIDUPSID=0AB15879656FD166028DF65039BDFF15; PSTM=1641442191; BAIDUID=911EF71E90573B2693EC612910B1F7BE:FG=1; BCLID_BFESS=9239639223377566883; BDSFRCVID_BFESS=1T-OJeCmHxdstirHc7RXbo9jumKK0gOTHllnPXllHP8_1buVJeC6EG0Ptf8g0KubFTPRogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tJkD_I_hJKt3fP36q6_a2-F_2xQ0etJXf5Txbp7F5lOVO-ngKU613MkSjNOj5t482jTLahkM5h7xObR1hl3ih-An0a7dJ4jtQeQ-5KQN3KJmfbL9bT3v5tDz3b3N2-biWbRM2MbdJqvP_IoG2Mn8M4bb3qOpBtQmJeTxoUJ25DnJhhCGe6-MjT3-DG8jqbvEHDc-WJ3t-TrjDCvRhMjcy4LdjG5N0PJT5bv73K022boobJcGLqjW0R_X3-Aq54RMagQwLPJEytQTS-5VbtoMQfbQ0-cOqP-jWbnu-qTo2n7JOpkRbUnxy50vQRPH-Rv92DQMVU52QqcqEIQHQT3m5-5bbN3ht6IHJbCJoDD5tIvbfP0kjjQWMt_h-fuX5-CstGPL2hcH0b61JbbR5-rKy-JW0R7a25cBbCjiaKJjBMb1DbRk0h7ShMkrebPD5JQpWDTm_q5TtUJMeCnTDMRh-xK70b5yKMnitIv9-pPKWhQrh459XP68bTkA5bjZKxtq3mkjbPbDfn028DKu-n5jHj3WDG-J3q; __yjs_duid=1_ada3d0ac8d4be7042dd53d52221555631641452261829; BAIDUID_BFESS=911EF71E90573B2693EC612910B1F7BE:FG=1; BD_HOME=1; H_PS_PSSID=35104_31660_34584_35490_35841_35887_35542_35318_26350_35867_22158; BD_UPN=12314753; delPer=0; BD_CK_SAM=1; PSINO=7; H_PS_645EC=09c89Z6QKcJ4xzJZr1LUqxrp0qdbpltyn/ixDDrfq5R6r0cQWwLiJT3HLZY; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BA_HECTOR=a424810gag04818hg31h15uop0q; baikeVisitId=492b5e23-3a27-4d6d-bf0a-ab5907361a87; BDSVRTM=643',
    ]
    return random.choice(cookies).strip()
def _refresh_yjs_duid(cookie):
    """Return *cookie* with the value of its ``__yjs_duid`` pair replaced.

    Every other ``name=value`` pair, and the ``;`` that follows the replaced
    pair, is kept as it was. A cookie without ``__yjs_duid`` is returned
    unchanged.
    """
    before, marker, after = cookie.partition("__yjs_duid=")
    if not marker:
        return cookie
    _old_value, separator, remainder = after.partition(";")
    # NOTE(review): nothing is fed to the hash, so this digest is the same on
    # every call; confirm whether a per-request value was intended.
    digest = hashlib.md5().hexdigest()
    return before + "__yjs_duid=1_" + digest + separator + remainder


def _build_headers(cookie):
    """Return the request headers, carrying *cookie* plus a random suffix."""
    if cookie:
        cookie = _refresh_yjs_duid(cookie) + ";random=" + str(random.randint(500, 4000))
    return {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Referer": "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=2&ch=&tn=baiduhome_pg&bar=&wd=123&oq=123&rsv_pq=896f886f000184f4&rsv_t=fdd2CqgBgjaepxfhicpCfrqeWVSXu9DOQY5WyyWqQYmsKOC%2Fl286S248elzxl%2BJhOKe2&rqlang=cn",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
        "Sec-Fetch-Mode": "navigate",
        "Cookie": cookie,
        "Connection": "Keep-Alive",
    }


def _build_search_url(kw, pn, tp):
    """Return the search URL for keyword *kw*, page *pn* and search type *tp*.

    Raises ValueError when *tp* is not a supported type or *pn* is neither
    'np' nor an integer string.
    """
    # Each entry ends with the query parameter that receives the keyword.
    # Search types that are currently disabled, kept for reference:
    #   news    https://www.baidu.com/s??rtt=1&bsst=1&cl=2&tn=news&word=
    #   zhidao  https://zhidao.baidu.com/search?ie=utf-8&f=8&rsv_bp=1&tn=baidu&word=
    #   tieba   https://tieba.baidu.com/f?fr=wwwt&ie=utf-8&kw=
    #   video   https://www.baidu.com/sf/vsearch?pd=video&tn=vsearch&ie=utf-8&wrsv_spt=10&wd=
    prefixes = {
        '1': "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=",  # web
        '2': "https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=",  # images
        '3': "https://wenku.baidu.com/search?lm=0&od=0&ie=utf-8&word=",  # wenku
        '4': "https://baike.baidu.com/search?lm=0&od=0&ie=utf-8&word=",  # baike
    }
    if tp not in prefixes:
        raise ValueError(f"unsupported search type: {tp!r}")
    if pn == 'np':
        page, rsv_page = 1, '&rsv_page=1'
    else:
        page, rsv_page = int(pn), ''
    # Percent-encode the keyword so characters such as '&' or '#' cannot
    # break the query string.
    text = quote(kw, safe="")
    input_t = random.randint(500, 4000)
    return f"{prefixes[tp]}{text}&oq={text}&pn={(page - 1) * 10}&inputT={input_t}{rsv_page}"


def _strip_page_chrome(soup, tp):
    """Remove the header, tab bar, sidebar and footer <div>s for type *tp*."""
    common_ids = ("head", "s_tab", "content_right", "rs_new", "foot")
    ids_by_type = {
        '1': common_ids,
        '2': common_ids + ("bdpcImgTab", "topInfoBar"),
        '3': common_ids,
    }
    # The Baike page is addressed by CSS class rather than by id.
    classes_by_type = {
        '4': ("wiki-common-headTab", "wgt-footer-main"),
    }
    for element_id in ids_by_type.get(tp, ()):
        node = soup.find("div", id=element_id)
        if node is not None:
            node.decompose()
    for css_class in classes_by_type.get(tp, ()):
        node = soup.find("div", class_=css_class)
        if node is not None:
            node.decompose()


def _rewrite_pager_links(soup, kw, tp):
    """Point every link inside the ``page`` <div> at the platform's search route."""
    pager = soup.find("div", id="page")
    if pager is None:
        return
    encoded_kw = quote(kw, safe="")
    for link in pager.find_all("a"):
        span = link.find('span')
        # Links without a <span> get the 'np' marker; the others carry the
        # text of their <span> as the page value.
        page_label = span.get_text(strip=True) if span is not None else 'np'
        link['href'] = f"/article/searchEverything?wd={encoded_kw}&pn={page_label}&tp={tp}"


def search_kw(kw, pn, tp):
    """Fetch a Baidu result page and return it with Baidu's own chrome removed.

    Args:
        kw: Search keywords.
        pn: Page number as a decimal string, or 'np', which requests page 1
            with ``rsv_page=1`` appended to the URL.
        tp: Search type: '1' web, '2' images, '3' wenku, '4' baike.

    Returns:
        The prettified document as UTF-8 encoded bytes. The same value is
        also printed to stdout.

    Raises:
        ValueError: If *tp* is not a supported type, or *pn* is neither 'np'
            nor an integer string. Raised before any network access.
    """
    url = _build_search_url(kw, pn, tp)
    headers = _build_headers(get_random_cookie())
    # NOTE(review): this mapping only has an "http" entry while every search
    # URL is https, so requests will not route these calls through the
    # proxy. Left as is, because switching would change how the script
    # reaches Baidu.
    proxies = GetProxy().get_ip_http()
    # Close the session when done, and bound the wait so a stalled
    # connection cannot hang the caller forever.
    with HTMLSession() as session:
        r = session.get(url, headers=headers, proxies=proxies, timeout=30)
        r.encoding = r.apparent_encoding
        content_html = r.html.html
    soup = BeautifulSoup(content_html, features="html.parser")
    _strip_page_chrome(soup, tp)
    _rewrite_pager_links(soup, kw, tp)
    soup_content = soup.prettify(encoding="utf-8", formatter='minimal')
    # NOTE(review): soup_content is bytes, so this prints its b'...' repr
    # rather than the HTML text. Kept unchanged in case a consumer of stdout
    # relies on that format.
    print(soup_content)
    return soup_content
if __name__ == '__main__':
    # Defaults used when the command line does not supply a value.
    kw, pn, tp = '广州市区', '1', '4'
    cli_args = sys.argv[1:]
    if len(cli_args) == 3:
        # <keyword> <page> <type>
        kw, pn, tp = cli_args
    elif len(cli_args) == 2:
        # <page> <type>; the default keyword is kept.
        pn, tp = cli_args
    result = search_kw(kw, pn, tp)