Overview: scrape the resume templates from 个人简历网 (gerenjianli.com) and save them locally (http://www.gerenjianli.com/moban/index.html)
Code:
import requests
from lxml import etree
import os
if __name__ == '__main__':
    # The single-page version is identical to one iteration of the loop below,
    # with url fixed to 'http://www.gerenjianli.com/moban/index.html', so it is
    # omitted here.

    # Scrape multiple pages of data
    headers = {
        # replace with your own browser's User-Agent string
        'User-Agent': 'put your own browser User-Agent string here'
    }
    # create the output folder if it does not exist yet
    if not os.path.exists('./resumeLibs'):
        os.mkdir('./resumeLibs')
    for pagenum in range(1, 4):  # scrape the templates on pages 1-3
        # page 1 is index.html; page n (n >= 2) is index_n.html
        if pagenum == 1:
            url = 'http://www.gerenjianli.com/moban/index.html'
        else:
            url = 'http://www.gerenjianli.com/moban/index_' + str(pagenum) + '.html'
        response = requests.get(url=url, headers=headers)
        page_text = response.text
        tree = etree.HTML(page_text)
        # each <li> holds one template: a link to its detail page and a preview image
        li_list = tree.xpath('//div[@class="list_boby"]/ul[@class="prlist"]/li')
        for li in li_list:
            a = li.xpath('./div/a/@href')[0]        # detail-page URL
            name = li.xpath('./div/a/img/@alt')[0]  # template name (image alt text)
            # the site serves GBK pages but requests decoded them as ISO-8859-1,
            # so round-trip the bytes to recover the Chinese name
            name = name.encode('iso-8859-1').decode('gbk')
            # fetch the detail page and pull the real download link out of it
            download_text = requests.get(url=a, headers=headers).text
            tree = etree.HTML(download_text)
            download_href = tree.xpath('//div[@class="donwurl2"]/a/@href')[0]
            # download the document as raw bytes and write it to disk
            doc_data = requests.get(url=download_href, headers=headers).content
            doc_path = 'resumeLibs/' + name + '.docx'
            with open(doc_path, 'wb') as fp:
                fp.write(doc_data)
            print(name, 'downloaded successfully!')
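A note on the encoding fix: the encode('iso-8859-1').decode('gbk') round-trip works because requests falls back to ISO-8859-1 when the response headers declare no charset. A minimal alternative sketch (not from the original post) is to declare the encoding on the Response once and, additionally, sanitize the template name before using it in a path, since scraped titles can contain characters that are illegal in file names. The URL, XPath expressions, and folder name are carried over from the script above; the GBK assumption and the sanitizing pattern are mine:

import os
import re

import requests
from lxml import etree

headers = {'User-Agent': 'put your own browser User-Agent string here'}
url = 'http://www.gerenjianli.com/moban/index.html'

response = requests.get(url=url, headers=headers)
response.encoding = 'gbk'  # assumption: the site serves GBK-encoded pages
tree = etree.HTML(response.text)

os.makedirs('./resumeLibs', exist_ok=True)
for li in tree.xpath('//div[@class="list_boby"]/ul[@class="prlist"]/li'):
    name = li.xpath('./div/a/img/@alt')[0]
    # replace characters that are not allowed in file names (assumed rule)
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
    print(os.path.join('resumeLibs', safe_name + '.docx'))
    # the detail-page fetch and the binary download proceed as in the script above

Note also that the script hardcodes the .docx extension; the actual extension could instead be taken from the download link with os.path.splitext(download_href)[1].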