1. 用Python批量爬取全站小说
2. 爬取一本书
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 8 20:31:43 2020
@author: douzi
"""
import requests
from parsel import Selector
import re
import time
def main():
    """Download every chapter of one hard-coded novel into ``./jiuxin/``.

    Fetches the book's index page, follows each chapter link found under the
    element with class ``listmain``, and saves each chapter as
    ``<number>.txt``, where ``<number>`` is the first run of digits in the
    first whitespace-separated token of the chapter's ``<h1>`` title.
    Chapters with no ``<h1>`` or no digits in that token are reported and
    skipped instead of being written to a nameless ``.txt`` file.
    """
    import os  # local import: this script's import block does not include os

    index_url = "http://www.shuquge.com/txt/89644/index.html"  # novel to crawl
    tpl = "http://www.shuquge.com/txt/89644/"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}
    out_dir = "./jiuxin/"
    # open() below does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    # Fetch the table-of-contents page.
    urllist = requests.get(index_url, headers=headers, timeout=30)
    index_sel = Selector(urllist.text)
    # Chapter links are the <a> elements under the element with class "listmain".
    index = index_sel.css(".listmain a::attr(href)").getall()

    # dict.fromkeys drops repeated hrefs while keeping first-seen order, so a
    # chapter that is linked more than once is only requested once.
    for n in dict.fromkeys(index):
        url = tpl + n
        response = requests.get(url, headers=headers, timeout=30)
        # Let requests guess the charset from the body rather than the headers.
        response.encoding = response.apparent_encoding
        print(response.request.url)

        sel = Selector(response.text)
        title = sel.css("h1::text").get()
        print(title)

        # "+" (not "*") so that a token without digits yields no match instead
        # of an empty match, which would name the file ".txt".
        tokens = title.split() if title else []
        match = re.search(r"[0-9]+", tokens[0]) if tokens else None
        if match:
            with open(out_dir + match.group(0) + ".txt", "w", encoding="utf-8") as f:
                f.write(title)
                f.writelines(sel.css("#content::text").getall())
        else:
            print("skipped (no chapter number in title):", url)

        # Pause between requests to avoid hammering the server.
        time.sleep(0.5)
# Run the single-book crawler only when executed as a script.
if __name__ == "__main__":
    main()
3. 爬取一个分类
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 8 20:31:43 2020
@author: douzi
"""
import requests
from parsel import Selector
import re
import time
import os
# Request headers shared by every HTTP call in this script; sends a desktop
# Chrome User-Agent string.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}
def download_one_chapter(url, book_name):
    """Download one chapter page and append its text to a file.

    The chapter's ``<h1>`` text is used as the file name, so the output path
    is ``./<book_name>/<title>.txt``. The directory ``./<book_name>`` must
    already exist. The file is opened in append mode.

    Args:
        url: Address of the chapter page.
        book_name: Name of the directory (relative to the current working
            directory) that receives the chapter file.
    """
    response = requests.get(url, headers=headers, timeout=30)
    # Let requests guess the charset from the body rather than the headers.
    response.encoding = response.apparent_encoding
    print(response.request.url)

    sel = Selector(response.text)
    title = sel.css("h1::text").get()
    print(title)

    if title is None:
        # Without a heading there is nothing to name the file after.
        print("skipped (no <h1> title found):", url)
    else:
        # Characters such as "/" or "?" are not valid in file names on every
        # platform; replace them so open() does not fail on such titles.
        safe_title = re.sub(r'[\\/:*?"<>|]', "_", title)
        path = os.path.join(".", book_name, safe_title + ".txt")
        with open(path, "a", encoding="utf-8") as f:
            f.write(title)
            f.writelines(sel.css("#content::text").getall())
            # Terminate the chapter with a newline only; the NUL character
            # previously written here does not belong in a text file.
            f.write("\n")

    # Pause between requests to avoid hammering the server.
    time.sleep(0.5)
def download_one_book(index_url, bname):
    """Download every chapter linked from a book's index page.

    Args:
        index_url: Address of the book's index page, e.g.
            ``http://www.shuquge.com/txt/89644/index.html``.
        bname: Directory name passed through to ``download_one_chapter`` as
            the destination for the chapter files.
    """
    from urllib.parse import urljoin  # local import keeps this block self-contained

    # Fetch the table-of-contents page.
    urllist = requests.get(index_url, headers=headers, timeout=30)
    urllist.encoding = urllist.apparent_encoding
    index_sel = Selector(urllist.text)
    # Chapter links are the <a> elements under the element with class "listmain".
    index = index_sel.css(".listmain a::attr(href)").getall()

    # dict.fromkeys drops repeated hrefs while keeping first-seen order, so a
    # chapter that is linked more than once is only downloaded once.
    for n in dict.fromkeys(index):
        # Resolve against the index page so relative and absolute hrefs both work.
        url = urljoin(index_url, n)
        download_one_chapter(url, bname)
def download_one_category():
    """Download every book listed on the first three pages of one category.

    For each book link found under ``span.s2`` on a category page, makes sure
    a directory named after the book exists in the current working directory
    and then downloads the book into it. An existing directory is reused.
    """
    tpl = "http://www.shuquge.com/category/7_{}.html"  # category to crawl
    # Pages 1 through 3.
    for page in range(1, 4):
        category_url = tpl.format(page)
        print(category_url)

        # Fetch the category listing page.
        cate_list = requests.get(category_url, headers=headers, timeout=30)
        cate_list.encoding = cate_list.apparent_encoding
        index_sel = Selector(cate_list.text)

        # Read href and text from the same <a> so a name can never be paired
        # with another book's URL.
        for link in index_sel.css("span.s2 a"):
            book_url = link.css("::attr(href)").get()
            book_name = link.css("::text").get()
            if not book_url or not book_name:
                continue  # nothing to fetch, or nothing to name the directory
            print(book_name, book_url)

            # Create the target directory if needed. The previous code removed
            # an existing directory and never recreated it, leaving the
            # download without a place to write.
            os.makedirs("./" + book_name, exist_ok=True)

            download_one_book(book_url, book_name)
# Run the category crawler only when executed as a script.
if __name__ == "__main__":
    # To fetch a single book instead, call
    # download_one_book(index_url, directory_name).
    download_one_category()