2022年 11月 9日

python小说全站爬虫_Python crawler notes(8)示例3:用Python批量爬行全站小说[以图书兴趣亭为例],python,爬虫,笔记,八,实例,爬取,以书趣,阁…

1. 用Python批量爬取全站小说

14fa758cecd907a694777806b8d27155.png

fdb265d066d52a45860435e0cd7878ef.png

2. 爬取一本书

# -*- coding: utf-8 -*-

“””

Created on Sat Feb 8 20:31:43 2020

@author: douzi

“””

import requests

from parsel import Selector

import re

import time

def main():

index_url = ‘http://www.shuquge.com/txt/89644/index.html’ # 想要爬取的小说

tpl = ‘http://www.shuquge.com/txt/89644/’

headers = {“User-Agent”: “Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36”}

# 获取小说目录页

urllist = requests.get(index_url, headers=headers)

index_sel = Selector(urllist.text)

#

《九星毒奶》最新章节

1040 养龙皮?

index = index_sel.css(‘.listmain a::attr(href)’).getall()

# 保存10章节

for n in index:

url = tpl + n

# 第 n 章

response = requests.get(url, headers=headers, timeout=30)

response.encoding = response.apparent_encoding

print(response.request.url)

# xpath css 选择器 提取网页数据结构(html)

# lxml pyquery parsel

sel = Selector(response.text)

title = sel.css(‘h1::text’).get()

print(title)

match = re.search(r'[0-9]*’, title.split()[0])

if match:

with open(“./jiuxin/” + match.group(0) + ‘.txt’, ‘w’, encoding = ‘utf-8’) as f:

f.writelines(title)

#

for line in sel.css(‘#content::text’).getall():

f.writelines(line)

time.sleep(0.5)

if __name__ == ‘__main__’:

main()

3154e456c61d582b51c149aa385ce933.png

3. 爬取一个分类

# -*- coding: utf-8 -*-

“””

Created on Sat Feb 8 20:31:43 2020

@author: douzi

“””

import requests

from parsel import Selector

import re

import time

import os

headers = {“User-Agent”: “Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36”}

# 下载一章节

def download_one_chapter(url, book_name):

# 第 n 章

response = requests.get(url, headers=headers, timeout=30)

response.encoding = response.apparent_encoding

print(response.request.url)

# xpath css 选择器 提取网页数据结构(html)

# lxml pyquery parsel

sel = Selector(response.text)

title = sel.css(‘h1::text’).get()

print(title)

with open(‘./’+book_name+’/’+title+’.txt’,’a+’, encoding = ‘utf-8’) as f:

f.writelines(title)

#

for line in sel.css(‘#content::text’).getall():

f.writelines(line)

f.write(‘\n\0’)

time.sleep(0.5)

# 下载一本书

def download_one_book(index_url, bname):

# index_url = ‘http://www.shuquge.com/txt/89644/index.html’ # 想要爬取的小说:例,九星毒奶

book_name = re.split(‘/’, index_url)[-2] # 例: 89644

tpl = ‘http://www.shuquge.com/txt/’ + book_name + ‘/’

# 获取小说目录页

urllist = requests.get(index_url, headers=headers)

urllist.encoding = urllist.apparent_encoding

index_sel = Selector(urllist.text)

#

《九星毒奶》最新章节

1040 养龙皮?

index = index_sel.css(‘.listmain a::attr(href)’).getall()

for n in index:

url = tpl + n

download_one_chapter(url, bname)

# 下载一类别

def download_one_category():

tpl = ‘http://www.shuquge.com/category/7_{}.html’ # 想要爬取的类别

# 3页

for page in range(1, 4):

category_url = tpl.format(page)

print(category_url)

# 获取小说类别页

cate_list = requests.get(category_url, headers=headers)

cate_list.encoding = cate_list.apparent_encoding

index_sel = Selector(cate_list.text)

books_url = index_sel.css(‘span.s2 a::attr(href)’).getall()

books_name = index_sel.css(‘span.s2 a::text’).getall()

for book_url in books_url:

# 如:变成随身老奶奶 http://www.shuquge.com/txt/109203/index.html

book_name = books_name[books_url.index(book_url)]

print(book_name, book_url)

if os.path.isdir(‘./’ + book_name):

os.removedirs(book_name)

else:

os.mkdir(‘./’ + book_name)

# 下载一本书

download_one_book(book_url, book_name)

if __name__ == ‘__main__’:

# download_one_book(‘asd’)

download_one_category()

2218c70b8a3092fcf6d1fc72a939791e.png

e6ae93041a9e14fb45a16dd8e9c299ae.png

04e71573510bb13ddc985f6af5c17d58.png

dbc491767ae261d4c52fd37222b247cd.gif