
Crawler draft

from urllib import request
from bs4 import BeautifulSoup
import logging
import os

base_url = 'http://www.xgyw.cc'
url_list = [base_url + '/Xgyw']   # page 1 of the gallery index
h_list = []                       # picture-page URLs collected later
path = r'E:\python\0425\pics'     # local folder the images are saved to

# Index pages 2-4 follow the pattern /Xgyw/page_N.html
for i in range(2, 5):
    url_list.append('http://www.xgyw.cc/Xgyw/page_%s.html' % i)
print(url_list)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'}

# Parse the index pages and collect the links to the individual galleries
def get_Hrefs():
    maxtrynum = 5
    hrefs_list = []
    for url in url_list:
        print('Parsing index page: [%s]' % url)
        for tries in range(maxtrynum):
            try:
                req = request.Request(url=url, headers=headers)
                res = request.urlopen(req)
                # strip stray 0xD0 bytes (the draft's u'\0xd0' was a typo for '\xd0')
                html = res.read().decode('UTF-8', 'ignore').replace('\xd0', '')
                soup = BeautifulSoup(html, 'html.parser')
                # gallery links all start with /Xgyw/Xgyw
                hrefs = soup.select('a[href^="/Xgyw/Xgyw"]')
                for each_href in hrefs:
                    hre = each_href.get('href')
                    hrefs_list.append(base_url + hre)
                break  # success, stop retrying this URL
            except Exception:
                if tries < (maxtrynum - 1):
                    continue
                logging.error("Has tried %d times to access url %s, all failed!", maxtrynum, url)
    return hrefs_list
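The a[href^="/Xgyw/Xgyw"] selector relies on BeautifulSoup's CSS support: ^= matches attribute values that start with the given prefix. A minimal, self-contained demonstration; the HTML snippet is made up for illustration:

from bs4 import BeautifulSoup

demo = '<a href="/Xgyw/Xgyw123.html">hit</a><a href="/about.html">miss</a>'
soup = BeautifulSoup(demo, 'html.parser')
print([a['href'] for a in soup.select('a[href^="/Xgyw/Xgyw"]')])
# -> ['/Xgyw/Xgyw123.html']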

# Deduplicate a list while preserving order
def dedupe(lst):
    L = []
    for i in lst:
        if i not in L:
            L.append(i)
    return L
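The membership test i not in L scans the whole list, so dedupe is quadratic overall. On Python 3.7+, where dict preserves insertion order, the same order-preserving deduplication runs in linear time; a one-line alternative (dedupe_fast is an illustrative name, not part of the draft):

def dedupe_fast(lst):
    # dict keys are unique and, since Python 3.7, keep insertion order
    return list(dict.fromkeys(lst))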
# Parse one gallery page and collect the URLs of its picture sub-pages
def get_pages_hrefs(href):
    times = 5
    for t in range(times):
        try:
            print('Found page: %s' % href)
            req = request.Request(url=href, headers=headers)
            res = request.urlopen(req)
            html = res.read().decode('UTF-8', 'ignore').replace('\xd0', '')
            soup = BeautifulSoup(html, 'html.parser')
            pages = soup.select('a[href^="/Xgyw/Xgyw"]')
            for each in pages:
                addr = each.get('href')
                h_list.append(base_url + addr)
            break  # success, stop retrying
        except Exception:
            if t < (times - 1):
                continue
            logging.error("Has tried %d times to access url %s, all failed!", times, href)
    return h_list
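get_Hrefs and get_pages_hrefs (and parser_pics below) repeat the same fetch-with-retries scaffolding. A minimal sketch of that pattern factored into one helper, reusing the script's request, headers, and logging; the name fetch_html and its signature are illustrative, not part of the draft:

def fetch_html(url, max_tries=5):
    # Return the decoded HTML of url, or None after max_tries failures
    for attempt in range(max_tries):
        try:
            req = request.Request(url=url, headers=headers)
            return request.urlopen(req).read().decode('UTF-8', 'ignore')
        except Exception:
            if attempt == max_tries - 1:
                logging.error("Has tried %d times to access url %s, all failed!", max_tries, url)
    return None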

# Parse each picture page and download every image it references
def parser_pics(url_lst):
    n = 5
    for each_url in url_lst:
        for t in range(n):
            try:
                print('Parsing picture page: %s' % each_url)
                req = request.Request(url=each_url, headers=headers)
                res = request.urlopen(req)
                html = res.read().decode('UTF-8', 'ignore').replace('\xd0', '')
                soup = BeautifulSoup(html, 'html.parser')
                # the site serves its photos from /uploadfile/...
                pics = soup.select('img[src^="/uploadfile"]')
                for each_pic in pics:
                    src = each_pic.get('src')
                    print("Parsed image url:", base_url + src)
                    save_pics(base_url + src)
                break  # success, move on to the next page
            except Exception:
                if t < (n - 1):
                    continue
                logging.error("Has tried %d times to access url %s, all failed!", n, each_url)
# Save one picture to disk, skipping files that already exist
def save_pics(pic):
    fileName = path + os.sep + pic.split("/")[-1]
    if not os.path.exists(fileName):
        with open(fileName, "wb") as f:
            print("Saving:", fileName)
            f.write(request.urlopen(pic).read())
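save_pics assumes E:\python\0425\pics already exists; open() raises FileNotFoundError otherwise. One line at startup guards against that, using the standard os.makedirs:

os.makedirs(path, exist_ok=True)  # create the download folder if it is missing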



if __name__ == '__main__':
    for i in dedupe(get_Hrefs()):
        get_pages_hrefs(i)
    print(h_list)
    parser_pics(h_list)
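The script calls logging.error but never configures logging, so the messages only reach stderr through the root logger's last-resort handler. If a log file is wanted, configure it before the crawl starts (crawler.log is an illustrative name):

logging.basicConfig(filename='crawler.log', level=logging.ERROR,
                    format='%(asctime)s %(levelname)s %(message)s')

Note also that h_list is never deduplicated, so passing dedupe(h_list) to parser_pics would avoid re-parsing pages that several galleries link to; save_pics already skips files present on disk, so duplicates only cost time.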
