November 3, 2022

Python web scraper: downloading resume templates

Overview: scrape the resume templates from the Gerenjianli site (http://www.gerenjianli.com/moban/index.html) and save them locally.
Code:

import requests
from lxml import etree
import os

if __name__ == '__main__':
    # Version 1: scrape a single page only
    # url = 'http://www.gerenjianli.com/moban/index.html'
    #
    # headers = {
    #     'User-Agent': 'put your own browser UA string here'
    # }
    # response = requests.get(url=url, headers=headers)
    # page_text = response.text
    #
    # tree = etree.HTML(page_text)
    # li_list = tree.xpath('//div[@class="list_boby"]/ul[@class="prlist"]/li')
    #
    # # create the output folder
    # if not os.path.exists('./resumeLibs'):
    #     os.mkdir('./resumeLibs')
    # for li in li_list:
    #     a = li.xpath('./div/a/@href')[0]        # detail-page URL
    #     name = li.xpath('./div/a/img/@alt')[0]  # template name from the thumbnail's alt text
    #     # the page is GBK-encoded but was decoded as ISO-8859-1, so
    #     # re-encode and decode to recover the Chinese name
    #     name = name.encode('iso-8859-1').decode('gbk')
    #     download_text = requests.get(url=a, headers=headers).text
    #     tree = etree.HTML(download_text)
    #     download_href = tree.xpath('//div[@class="donwurl2"]/a/@href')[0]
    #
    #     doc_data = requests.get(url=download_href, headers=headers).content
    #     doc_path = 'resumeLibs/' + name + '.docx'
    #     with open(doc_path, 'wb') as fp:
    #         fp.write(doc_data)
    #         print(name, 'downloaded successfully!')

    # Version 2: scrape multiple pages
    headers = {
        'User-Agent': 'put your own browser UA string here'
    }
    # create the output folder
    if not os.path.exists('./resumeLibs'):
        os.mkdir('./resumeLibs')

    for pagenum in range(1, 4):  # scrape resume templates from pages 1-3
        # page 1 is index.html; later pages are index_2.html, index_3.html, ...
        if pagenum == 1:
            url = 'http://www.gerenjianli.com/moban/index.html'
        else:
            url = 'http://www.gerenjianli.com/moban/index_' + str(pagenum) + '.html'

        response = requests.get(url=url, headers=headers)
        page_text = response.text

        # parse the list page and grab every template entry
        tree = etree.HTML(page_text)
        li_list = tree.xpath('//div[@class="list_boby"]/ul[@class="prlist"]/li')

        for li in li_list:
            a = li.xpath('./div/a/@href')[0]        # detail-page URL
            name = li.xpath('./div/a/img/@alt')[0]  # template name from the thumbnail's alt text
            # the page is GBK-encoded but was decoded as ISO-8859-1, so
            # re-encode and decode to recover the Chinese name
            name = name.encode('iso-8859-1').decode('gbk')
            # fetch the detail page and extract the actual download link
            download_text = requests.get(url=a, headers=headers).text
            tree = etree.HTML(download_text)
            download_href = tree.xpath('//div[@class="donwurl2"]/a/@href')[0]

            # download the .docx file and save it under resumeLibs/
            doc_data = requests.get(url=download_href, headers=headers).content
            doc_path = 'resumeLibs/' + name + '.docx'
            with open(doc_path, 'wb') as fp:
                fp.write(doc_data)
                print(name, 'downloaded successfully!')
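
A note on the encoding round-trip: requests falls back to ISO-8859-1 when a text/html response declares no charset, while this site actually serves GBK, which is why the name.encode('iso-8859-1').decode('gbk') trick is needed. A cleaner variant, sketched below on the assumption that requests' charset detection identifies the page as GBK, is to set response.encoding from response.apparent_encoding before touching .text:

import requests
from lxml import etree

url = 'http://www.gerenjianli.com/moban/index.html'
headers = {'User-Agent': 'put your own browser UA string here'}

response = requests.get(url=url, headers=headers)
# let requests' charset detection choose the real encoding instead of
# the ISO-8859-1 fallback, so .text is decoded correctly in one step
response.encoding = response.apparent_encoding
page_text = response.text

tree = etree.HTML(page_text)
for li in tree.xpath('//div[@class="list_boby"]/ul[@class="prlist"]/li'):
    # no .encode('iso-8859-1').decode('gbk') round-trip needed here
    name = li.xpath('./div/a/img/@alt')[0]
    print(name)

With the encoding fixed at the response level, every string extracted from the tree (names and links alike) is already correct.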
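
One more caveat: name comes straight from the thumbnail's alt text, so a template name containing a character that is illegal in file names (such as / or ?) would make the open() call fail. Below is a minimal sanitizer sketch; the helper name and the character set it strips are my own assumptions, not part of the original script:

import re

def safe_filename(name):
    # replace characters that are invalid in Windows/Unix file names
    # with underscores (hypothetical helper, not in the original script)
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

# usage inside the loop above:
# doc_path = 'resumeLibs/' + safe_filename(name) + '.docx'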