from urllib import request
from bs4 import BeautifulSoup
import logging
import os

base_url = 'http://www.xgyw.cc'
url_list = [base_url + '/Xgyw']
h_list = []
path = r'E:\python\0425\pics'
for i in range(2, 5):
    url_list.append('http://www.xgyw.cc/Xgyw/page_%s.html' % i)
print(url_list)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'}

# Parse the list pages and collect the gallery links
def get_Hrefs():
    maxtrynum = 5
    hrefs_list = []
    for url in url_list:
        print('Parsing page: [%s]' % url)
        for tries in range(maxtrynum):
            try:
                req = request.Request(url=url, headers=headers)
                res = request.urlopen(req)
                html = res.read().decode('UTF-8', 'ignore').replace('\xd0', '')
                # html = res.read().decode('gbk', 'ignore')
                # print(html)
                soup = BeautifulSoup(html, 'html.parser')
                # hrefs = soup.find_all('div', class_='biank1')
                hrefs = soup.select('a[href^="/Xgyw/Xgyw"]')
                # print(hrefs)
                for each_href in hrefs:
                    hre = each_href.get('href')
                    # print(hre)
                    hrefs_list.append(base_url + hre)
                break  # success, stop retrying this url
            except Exception:
                if tries < (maxtrynum - 1):
                    continue
                else:
                    logging.error("Has tried %d times to access url %s, all failed!", maxtrynum, url)
    return hrefs_list
# Deduplicate a list while preserving order
def dedupe(items):
    L = []
    for i in items:
        if i not in L:
            L.append(i)
    return L
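# Note: assuming Python 3.7+ (where dicts preserve insertion order), an
# equivalent one-line dedupe would be:
#     def dedupe(items):
#         return list(dict.fromkeys(items))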
# Parse each gallery and collect its picture-page urls
def get_pages_hrefs(href):
    times = 5
    # for href in href_list:
    for t in range(times):
        try:
            print('Found page: %s' % href)
            req = request.Request(url=href, headers=headers)
            res = request.urlopen(req)
            html = res.read().decode('UTF-8', 'ignore').replace('\xd0', '')
            # print(html)
            soup = BeautifulSoup(html, 'html.parser')
            pages = soup.select('a[href^="/Xgyw/Xgyw"]')
            # print(pages)
            for each in pages:
                addr = each.get('href')
                # print(addr)
                h_list.append(base_url + addr)
            break  # success, stop retrying
        except Exception:
            if t < (times - 1):
                continue
            else:
                logging.error("Has tried %d times to access url %s, all failed!", times, href)
    return h_list
# Parse the image urls on each picture page and save the images
def parser_pics(page_list):
    n = 5
    for each_list in page_list:
        for t in range(n):
            try:
                print('Parsing image page: %s' % each_list)
                req = request.Request(url=each_list, headers=headers)
                res = request.urlopen(req)
                html = res.read().decode('UTF-8', 'ignore').replace('\xd0', '')
                # print(html)
                soup = BeautifulSoup(html, 'html.parser')
                pics = soup.select('img[src^="/uploadfile"]')
                # print(pics)
                for each_pic in pics:
                    srcs = each_pic.get('src')
                    print("Parsed image url:", base_url + srcs)
                    save_pics(base_url + srcs)
                break  # success, stop retrying this page
            except Exception:
                if t < (n - 1):
                    continue
                else:
                    logging.error("Has tried %d times to access url %s, all failed!", n, each_list)
# Save one image to disk, skipping files that already exist
# (assumes the directory `path` already exists)
def save_pics(pic):
    fileName = path + os.sep + pic.split("/")[-1]
    if not os.path.exists(fileName):
        with open(fileName, "wb") as f:
            print("Saving:", fileName)
            f.write(request.urlopen(pic).read())
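# Note: request.urlopen() has no timeout by default, so one stalled
# connection can hang the whole run; urllib supports a timeout argument,
# e.g. request.urlopen(pic, timeout=10).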
if __name__ == '__main__':
    for i in dedupe(get_Hrefs()):
        get_pages_hrefs(i)
    print(h_list)
    parser_pics(h_list)