# 2022年 11月 3日

# python3论坛_Python3 论坛图片采集脚本

#!/usr/bin/env python3

import urllib.request

import urllib.error

import threading

import re

import os

import hashlib

import time

import sys

# Root address of the forum to crawl; must end with a slash,
# e.g. http://bbs.xx.com/
root_url = "{Your URL}"

# Folder the pictures are saved into; must end with a slash,
# e.g. /home/xxx/pic/
root_path = "{Your Path}"

# Forum board ID: in http://bbs.xx.com/forum-9-1.html it is the 9 (the 1 is
# the page number). Kept as a quoted placeholder so the module parses before
# it is filled in; the value is only ever interpolated into a URL with %s.
FORUM_ID = "{Your ID}"

pthread_number = 0  # number of picture downloads currently running

LOCK = threading.Lock()  # guards pthread_number and p_map

p_map = {}  # downloads in flight: MD5 of the picture URL -> picture URL

def waiting(max_threads=40, poll_interval=0.5):
    """Block until fewer than *max_threads* picture downloads are running.

    Polls the module-level ``pthread_number`` counter and sleeps
    *poll_interval* seconds between checks.

    NOTE: this is only a soft limit. The counter is read without holding
    ``LOCK`` and is incremented by a download thread inside its own
    ``run()``, i.e. after the caller has already started it, so several
    callers can pass the check at the same time and briefly overshoot
    *max_threads*.

    Args:
        max_threads: Number of running downloads at which callers wait.
        poll_interval: Seconds to sleep between two checks.
    """
    while pthread_number >= max_threads:
        time.sleep(poll_interval)

def md5(string):
    """Return the hexadecimal MD5 digest of *string*.

    Args:
        string: The data to hash. ``bytes`` are hashed as given; a ``str``
            is encoded as UTF-8 first, so callers that forget ``.encode()``
            no longer get a ``TypeError``.

    Returns:
        The 32-character lowercase hexadecimal digest.
    """
    if isinstance(string, str):
        string = string.encode()
    return hashlib.md5(string).hexdigest()

def get_page(url, encoding="GBK"):
    """Fetch *url* and return its body as one line of text.

    Args:
        url: Address of the page to fetch.
        encoding: Character encoding used to decode the response body.

    Returns:
        The decoded body with every ``\\n`` and ``\\r`` removed, so that the
        regular expressions applied by the callers never have to span lines.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the body is not valid in *encoding*.
    """
    # The context manager closes the connection even when read() fails.
    with urllib.request.urlopen(url) as rsp:
        raw = rsp.read()
    return raw.decode(encoding).replace("\n", "").replace("\r", "")

class DownloadPicture(threading.Thread):
    """Thread that downloads one picture URL into one local file.

    While it runs, the download is counted in the module-level
    ``pthread_number`` and listed in ``p_map`` (keyed by the MD5 of the URL),
    which is what the status loop at the bottom of the script prints.
    """

    def __init__(self, url, path):
        """Remember the picture *url* and the file *path* to write it to."""
        threading.Thread.__init__(self)
        self.url = url
        self.path = path

    def run(self):
        """Download ``self.url`` to ``self.path`` and report the outcome.

        An empty response is skipped without creating a file. Failures are
        reported on stdout and never propagate out of the thread.
        """
        global pthread_number

        key = md5(self.url.encode())

        # Register this download in the shared bookkeeping.
        with LOCK:
            pthread_number += 1
            p_map[key] = self.url

        try:
            with urllib.request.urlopen(self.url) as rsp:
                data = rsp.read()
            # read() returns bytes, so test for emptiness rather than
            # comparing against a str literal.
            if not data:
                return
            with open(self.path, "wb") as f:
                f.write(data)
            print(self.path)
        except urllib.error.HTTPError:
            print(self.path + ", http error")
        except Exception:
            # Top-level boundary of the worker thread: report and give up on
            # this picture, whatever went wrong (network, disk, bad URL).
            print(self.path + ", error")
        finally:
            # Always unregister, including on the early return above.
            with LOCK:
                pthread_number -= 1
                # pop() instead of del: two threads may be downloading the
                # same URL, and the first one to finish removes the entry.
                p_map.pop(key, None)

class DownloadPage(threading.Thread):
    """Thread that reads one forum thread page and downloads its pictures.

    Pictures are stored under
    ``root_path/<first two hex digits of MD5(name)>/<name>/``.
    """

    # Characters that are replaced in folder names and that disqualify a
    # URL basename from being used as a file name.
    INVALID_CHARS = '/\\:*?"<>|\')'
    _NAME_TABLE = str.maketrans(INVALID_CHARS, "_" * len(INVALID_CHARS))

    # NOTE(review): the HTML delimiters of this regex were lost from this
    # copy of the script; only the capture group survived. As a placeholder
    # the whole page is treated as the post content. Narrow this to the
    # post-body container of the target forum -- TODO confirm against the
    # real markup.
    CONTENT_PATTERN = r"(.*)"

    # Absolute http(s) picture links; the URL stops at the closing quote.
    IMAGE_PATTERN = r'src="(http[^"\s]+)"'

    def __init__(self, page, name, n):
        """Prepare the download folder for one thread.

        Args:
            page: Address of the thread page, relative to ``root_url``.
            name: Thread title; used (after filtering) as the folder name.
            n: Counter supplied by the caller, only echoed by ``check_dir``.
        """
        threading.Thread.__init__(self)
        self.name = self.filter_name(name)
        self.number_flag = n
        self.page = root_url + page
        bucket = md5(self.name.encode("GBK")).lower()[:2]
        self.path = root_path + bucket + "/" + self.name + "/"
        # exist_ok: many DownloadPage threads are constructed concurrently
        # and may share the same bucket folder.
        os.makedirs(self.path, exist_ok=True)

    def filter_name(self, name):
        """Return *name* with every character of INVALID_CHARS replaced by '_'."""
        return name.translate(self._NAME_TABLE)

    def run(self):
        """Fetch the page, start one download per new picture, wait for them."""
        page = get_page(self.page)
        bodies = re.findall(self.CONTENT_PATTERN, page)
        if not bodies:
            return
        # Only the first matched content block is scanned for pictures.
        # dict.fromkeys drops repeated URLs (keeping order) so two threads
        # never write the same file at once.
        urls = list(dict.fromkeys(re.findall(self.IMAGE_PATTERN, bodies[0])))
        if not urls:
            print(self.name, ", no picture")
            return

        workers = []
        for url in urls:
            target = self.path + self.get_basename(url)
            if os.path.isfile(target):
                print(target + ", is exists")
                continue  # already downloaded on an earlier run
            waiting()  # respect the global download limit
            worker = DownloadPicture(url, target)
            workers.append(worker)
            worker.start()

        for worker in workers:
            worker.join()
        self.check_dir()

    def get_basename(self, url):
        """Derive a file name from *url*.

        Uses the last path component of the URL; when that has no extension
        or contains a character from INVALID_CHARS, falls back to
        ``MD5(url) + ".jpg"``.
        """
        name = os.path.basename(url)
        extension = os.path.splitext(name)[1]
        if not extension or self.is_error_name(name):
            name = md5(url.encode()) + ".jpg"
        return name

    def is_error_name(self, name):
        """Return True if *name* contains any character of INVALID_CHARS."""
        return any(char in name for char in self.INVALID_CHARS)

    def check_dir(self):
        """Report that this thread page has been processed."""
        # TODO: add further checks (e.g. empty folder handling).
        print(self.page, self.path, ":", self.number_flag, " check finish")

class ImageCollect(threading.Thread):
    """Thread that walks forum listing pages and processes every thread found."""

    # NOTE(review): the HTML part of this regex was lost from this copy of
    # the script. The calling code needs two groups per match: the thread
    # address relative to root_url, and the thread title. This generic
    # anchor pattern is a placeholder that matches EVERY double-quoted link
    # on the listing page -- restrict it to the thread-title links of the
    # target forum. TODO confirm against the real markup.
    THREAD_PATTERN = r'<a[^>]*\shref="([^"]+)"[^>]*>(.*?)</a>'

    def __init__(self, fid, begin, end=1):
        """Store the crawl range.

        Args:
            fid: Forum board ID used in the listing URL.
            begin: Number of the first listing page to read.
            end: How many listing pages to read starting at *begin*
                (a count, despite the name).
        """
        threading.Thread.__init__(self)
        self.fid = fid
        self.begin = begin
        self.end = end

    def process_page(self, url):
        """Return the ``(relative_address, title)`` pairs found on listing *url*."""
        page = get_page(url)
        return re.findall(self.THREAD_PATTERN, page)

    def run(self):
        """Process every listing page, one DownloadPage thread per forum thread.

        When finished -- normally or because of an error -- sets the
        module-level ``pthread_number`` to -1, the value the status loop in
        the main thread waits for.
        """
        global pthread_number

        try:
            os.makedirs(root_path, exist_ok=True)

            for i in range(self.begin, self.begin + self.end):
                url = "%sforum-%s-%s.html" % (root_url, self.fid, i)
                all_page = self.process_page(url)
                remaining = len(all_page)

                workers = []
                for link, title in all_page:
                    waiting()
                    worker = DownloadPage(link, title, remaining)
                    workers.append(worker)
                    worker.start()
                    remaining -= 1

                # Finish one listing page before moving on to the next.
                for worker in workers:
                    worker.join()

            print("\nOVER finish")
        finally:
            # Without this, any exception above would leave the main
            # thread's status loop waiting forever.
            with LOCK:
                pthread_number = -1

if __name__ == '__main__':
    # Arguments: first listing page, number of listing pages to load.
    if len(sys.argv) != 3:
        print("Usage: " + sys.argv[0] + " begin_page number")
        sys.exit(2)

    ic = ImageCollect(FORUM_ID, int(sys.argv[1]), int(sys.argv[2]))
    ic.start()

    # Print the state of the download queue every five seconds until the
    # collector signals completion by setting pthread_number to -1.
    while True:
        # "with" releases the lock on every path, including the break.
        with LOCK:
            if pthread_number == -1:
                break
            print("TH:", pthread_number)
            for url in p_map.values():
                print("TH:", url)
        time.sleep(5)