1. Overview
Crawl target:
https://www.gsmchoice.com/zh-cn/catalogue/
Crawl depth: 3 levels. Data collected: specification data for every phone brand and model.
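The three levels map directly onto the site's URL hierarchy (the brand/model examples below are taken from the docstrings in the code further down):

- Level 1, catalogue: https://www.gsmchoice.com/zh-cn/catalogue/
- Level 2, brand: https://www.gsmchoice.com/zh-cn/catalogue/nec/
- Level 3, model: https://www.gsmchoice.com/zh-cn/catalogue/nec/mediasxn06e/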
2. Environment
- Python 3.9.7
- beautifulsoup4 4.10.0
- bs4 0.0.1
- certifi 2021.5.30
- charset-normalizer 2.0.6
- idna 3.2
- lxml 4.6.3
- pip 21.2.4
- requests 2.26.0
- setuptools 57.4.0
- soupsieve 2.2.1
- urllib3 1.26.7
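Only requests, beautifulsoup4, and lxml are used directly; the rest of the list above are transitive dependencies. Something like the following should reproduce the environment (version pins taken from the list):

pip install requests==2.26.0 beautifulsoup4==4.10.0 lxml==4.6.3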
3. Code
First, crawl down to the level-3 URLs (one per phone model):
'''
* @Description: crawl phone info from the level-3 pages of www.gsmchoice.com
* @Param: url, level info
* @return: phone_info
* @Author: zhangjinke@corp.netease.com
* @Date: 2021-09-22
* the level-3 page URLs follow an obvious pattern
'''
import random
import re

import requests
from bs4 import BeautifulSoup


def craw_lev1(base_url, url):
    """in: level-1 url -> out: level-2 urls, e.g. https://www.gsmchoice.com/zh-cn/catalogue/nec/"""
    li = []
    req_headers = dict()
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
    ]
    req_headers['User-Agent'] = random.choice(user_agent_list)
    req_obj = requests.get(url, headers=req_headers)
    bresp = BeautifulSoup(req_obj.text, 'lxml')
    catalogue_brands = bresp.find(id='CatalogueBrands')
    for item in catalogue_brands.find_all('a'):
        # hrefs inside this container are already unique, so append without dedup
        if "https" in item['href']:
            li.append(item['href'])
        else:
            li.append(base_url + item['href'])
    return li


def craw_lev2(url):
    """in: level-2 url -> out: level-3 urls, e.g. https://www.gsmchoice.com/zh-cn/catalogue/nec/mediasxn06e/"""
    base_url3 = []
    base_url = "https://www.gsmchoice.com/"
    factory = url.split('/')[-3]
    reg = re.compile(r'href="/zh-cn/catalogue/' + factory + r'/\w*')
    req_obj = requests.get(url)
    soup = BeautifulSoup(req_obj.text, 'html.parser')
    # the page holds one or two of these containers; collect the <a> tags from all of them
    soup_a = []
    for container in soup.find_all('div', class_='phone-container phone-container--left'):
        soup_a += container.find_all('a')
    for i in soup_a:
        x = reg.findall(str(i))[0]
        base_url3.append(base_url + str(x).split('"/')[1])
    return base_url3


def page_num(u):
    """Return the phone count shown on a brand page (third-to-last <b> tag)."""
    req_obj = requests.get(u)
    soup = BeautifulSoup(req_obj.text, 'html.parser')
    b = soup.find_all('b')
    num = re.findall(r"\d+", str(b[-3]))[0]
    return num


if __name__ == '__main__':
    base_url = "https://www.gsmchoice.com"
    url_lev1 = "https://www.gsmchoice.com/zh-cn/catalogue/"  # 410 brands
    url_lev2 = craw_lev1(base_url, url_lev1)
    # sanity check: print each level-2 url together with its phone count via page_num(...)
    # walk the level-2 (brand) pagination and collect the level-3 (brand-model) urls
    with open("/Users/zjk/IdeaProjects/test_Python/resource/craw_results.txt", 'a', encoding="utf-8") as file:
        for iu in url_lev2:
            url_lev3 = []
            intn = int(page_num(iu))
            # 40 models per listing page, so round the page count up
            if intn % 40 == 0:
                n = intn // 40
            else:
                n = intn // 40 + 1
            for x in range(0, n):
                # e.g. real_url = https://www.gsmchoice.com/zh-cn/catalogue/huawei/models/80
                real_url = iu + "models/" + str(x * 40)
                status_code = requests.get(real_url).status_code
                url_lev3 += craw_lev2(real_url)
                print(str(status_code) + " - crawled: " + real_url)
            for m in url_lev3:
                file.write(m + "\n")
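A side note on the pagination math in __main__: each brand page lists 40 models, so the if/else that bumps intn // 40 up by one is plain ceiling division. A one-line equivalent, assuming the same intn as above:

import math

n = math.ceil(intn / 40)  # pages of 40 models each, rounded up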
Then iterate over the level-3 URLs saved in the file and crawl the phone data we need.
[Single-threaded version]
# coding:utf-8
'''
* @Description: crawl the data of one level-3 url; one record per line, \t\t\t-separated
* @Param: url_lev3
* @return: spec data for each phone model
* @Author: zhangjinke@corp.netease.com
* @Date: 2021-09-23
'''
import random
import re
import unicodedata

import requests
from bs4 import BeautifulSoup
from requests.packages import urllib3

urllib3.disable_warnings()


def get_soup(url_lev3):
    """Fetch a model page; return [main spec table soup, sensor table soup or 'null']."""
    real_sout_li = []
    req_headers = dict()
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
    ]
    req_headers['User-Agent'] = random.choice(user_agent_list)
    # pull the raw html of the spec tables out of the page
    req_obj = requests.get(url_lev3, headers=req_headers)
    req_obj.encoding = req_obj.apparent_encoding
    soup = BeautifulSoup(req_obj.text, 'lxml')
    soup_t = soup.find_all(class_='PhoneData YesDict')
    real_sout_li.append(str(soup_t[0]))
    for sou in soup_t:
        html_text = str(sou)
        if '加速度计' in html_text:  # "accelerometer" marks the sensor table
            real_sout_li.append(html_text)
    # turn the html fragments back into soup objects
    soup_one = BeautifulSoup(real_sout_li[0], 'lxml')
    if len(real_sout_li) == 1:
        return [soup_one, 'null']
    soup_two = BeautifulSoup(real_sout_li[1], 'lxml')
    return [soup_one, soup_two]


def craw_cell1(soup_1):
    """Regex-clean the main spec table into a {key: value} dict."""
    # keys
    item = re.sub(r'\t*|\n*|\[|\]', '',
                  unicodedata.normalize('NFKC', str(soup_1.find_all(class_='phoneCategoryName')).replace('\xa0', '')))
    key_item = str(item).replace('<th class="phoneCategoryName">', '').replace('</th>', '')
    key_li = key_item.split(', ')
    # values
    item_v = re.sub(r'\t*|\n*|\[|\]', '', str(soup_1.find_all(class_='phoneCategoryValue')))
    item_v_li = item_v.split('</td>, <td class=')
    for index in range(0, len(item_v_li)):
        if 'procSection' in item_v_li[index]:
            # processor cell: strip the nested section/gpu divs
            item_v_li[index] = re.sub(r'(.*)ue">|<div(.*)ction">|<div(.*)Gpu">|</div>|\xa0', '',
                                      item_v_li[index].replace('<br/>', ' '))
        elif ('<div' in item_v_li[index]) and ('<span' in item_v_li[index]):
            # tick/cross icon with extra text alongside it
            ss = re.sub(r'.*<span class="|<a(.*)/a>|"> </span>', '', item_v_li[index])
            cont = ss.split('">')[1].split('</div>')[0]
            if 'tick' in item_v_li[index]:
                item_v_li[index] = 'yes ' + cont
            elif 'cross' in item_v_li[index]:
                item_v_li[index] = 'no ' + cont
            else:
                item_v_li[index] = 'unknown'
        elif '<div' in item_v_li[index]:
            item_v_li[index] = item_v_li[index].split('<div c')[0].split('">')[-1]
        elif '<span' in item_v_li[index]:
            # bare tick/cross/question icon
            mark = item_v_li[index].split('<span class="')[1].split('">')[0]
            if mark == 'tick':
                item_v_li[index] = 'yes'
            elif mark == 'question':
                item_v_li[index] = 'unknown'
            else:
                item_v_li[index] = 'no'
        elif '</a>' in item_v_li[index]:
            item_v_li[index] = item_v_li[index].split('</a>')[0].split('">')[-1]
        else:
            item_v_li[index] = item_v_li[index].split('">')[-1].replace('<br/>', ' ')
        # sweep up stray tags the branches above missed
        item_v_li[index] = re.sub(
            r'<b>|</b>|<br/>|<div cla(.*)\"\>\/\>', '',
            re.sub(r'<div(.*)</div>|<img cl(.*)"/>|<a href="(.*)"|<span class="(.*)</span>|<div cla(.*)\"\>\/\>|\<\/(.?)\>',
                   '', item_v_li[index]))
    item_v_li[-1] = item_v_li[-1].replace('</td>', '')
    res_li = {}  # return an empty dict (not None) when keys and values fail to line up
    if len(key_li) == len(item_v_li):
        for ind in range(0, len(key_li)):
            res_li[key_li[ind]] = item_v_li[ind]
    return res_li


def craw_cell2(soup_2):
    """Map each sensor name in the sensor table to yes/no/unknown."""
    res_li = {}
    sub = re.sub(r'\t*|\n*|\xa0', '', str(soup_2))
    findall_key = re.findall(r'me">(.+?)</div>', sub)
    findall_value = re.findall(r'<span class="(.+?)">', sub)
    if len(findall_key) == len(findall_value):
        try:
            for i in range(0, len(findall_key)):
                mark = 'yes'
                if findall_value[i] == 'cross':
                    mark = 'no'
                elif findall_value[i] == 'question':  # was a duplicate 'cross' test in the original
                    mark = 'unknown'
                res_li[findall_key[i]] = mark
        except Exception as va:
            print('cell2 error: %s' % va)
    return res_li


if __name__ == '__main__':
    _path = 'D:\\Py_T\\resources\\craw_results.txt'
    path_ = 'D:\\Py_T\\resources\\result_2021.txt'
    # iterate over the saved urls and crawl each one
    with open(_path, 'r', encoding="utf-8") as _file, open(path_, 'a', encoding="utf-8") as file_:
        for url in _file:
            url = url.strip()
            if 200 == requests.get(url).status_code:
                print('crawling: ' + url)
                soup_1, soup_2 = get_soup(url)  # fetch once, reuse for both tables
                r_a = craw_cell1(soup_1)
                r_b = craw_cell2(soup_2)
                result = dict(list(r_a.items()) + list(r_b.items()))
                # the original listing is cut off after this point; presumably the record is
                # persisted roughly like so:
                file_.write(str(result) + "\n")
                # ${result} sample (truncated in the original source):
                # {'牌子': 'Samsung Smartphones', '模型': 'Galaxy M52',
                #  '手机其他名称': 'SM-M526B/DS, SM-M526BR/DS', 'Standardy': 'GSM, UMTS, LTE, 5G',
                #  'GSM frequencies': '850 900 1800 1900 ', 'UMTS标准': '850 900 1700 1900 2100 ',
                #  'Standard LTE': 'FDD LTE: 2100, 1800, 900, 850 TDD LTE: 2600, 2500, 2300, 1900',
                #  'Standard 5G': 'yes', '手机规格': '触控手机', '防水性和防尘性': 'IP67',
                #  '大小': '164.60 x 76.90 x 8.20 mm', 'Weight': '176.00 g',
                #  'Display': '彩色 / Super AMOLED 16M 颜色, 120 Hz 1080 x 2400 px (6.70") 393 ppi ∼85.6% screen-to-body ratio',
                #  '显示保护': 'Corning Gorilla Glass 5', '通话最长时间': 'unknown', '待机最长时间': 'unknown',
                #  '标准电池': 'Li-Ion 5000 mAh', '快速充电': 'Fast Charging ', '无线充电': 'no',
                #  '手机存储': '128 GB, 256 GB', '随机存取存储 (RAM)': '6 GB, 8 GB', 'Memory cards': 'yes',
                #  'Operating system': 'Android 11', '接口': 'One UI 3.1',
                #  '处理器': 'Qualcomm Snapdragon 778G Processor clock: 2.40 GHz 芯的数目: 8 GPU: Adreno 642L ',
                #  'Touchscreen': 'yes', '双SIM卡': 'yes', 'SIM卡标准': 'nanoSIM, nanoSIM',
                #  '卡双模式': 'dual standby', '混合双卡双待': 'nanoSIM, microSD', '发行日期': '第 3 季 2021',
                #  '加速度计': 'yes', '接近传感器': 'no', '光传感器': 'yes', '磁力仪': 'yes',
                #  '陀螺仪': 'yes', '晴雨表': 'no', '高度表': 'no', '重力感应器': 'yes', ...}
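The original listing imports ThreadPool and builds a ThreadPool(5) that is never used, which suggests a multi-threaded variant was planned. A minimal sketch of what that could look like, assuming the functions and paths defined above; crawl_one is a hypothetical wrapper introduced here, not part of the original:

from multiprocessing.pool import ThreadPool

def crawl_one(url):
    # fetch one model page and merge both spec tables into a single record
    soup_1, soup_2 = get_soup(url)
    return dict(list(craw_cell1(soup_1).items()) + list(craw_cell2(soup_2).items()))

with open(_path, 'r', encoding="utf-8") as _file:
    urls = [u.strip() for u in _file if u.strip()]

pool = ThreadPool(5)                 # at most 5 pages in flight at once
results = pool.map(crawl_one, urls)
pool.close()
pool.join()

with open(path_, 'a', encoding="utf-8") as file_:
    for r in results:
        file_.write(str(r) + "\n")

Since get_soup already randomizes the User-Agent per request, the five workers would not all present the same browser fingerprint.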