2022年 11月 5日

Python 处理 HTML

from urllib.parse import urljoin
from Common.PSoup import *
class HtmlCommon:
    """Post-processing helpers for an HTML string.

    Parsing is delegated to ``PSoup`` (imported from ``Common.PSoup``).
    NOTE(review): the code relies on the parsed objects exposing
    ``find``/``append``/``html``/``items``/``attr`` -- presumably a
    pyquery-style wrapper; confirm against ``Common.PSoup``.
    """

    def handleHtmlString(self, htmlString, url, dic=None):
        """Append extra ``<div>`` elements and resolve link/image URLs.

        The method performs three passes, re-parsing the markup between
        them exactly as the serialized output of the previous pass:

        1. Append one ``<div id='KEY'>VALUE</div>`` per ``dic`` entry to
           the ``<body>`` element (wrapping the markup in ``<body>`` first
           when ``find("body")`` yields ``None``).
        2. Join every ``<a href>`` against ``url``.
        3. Join every ``<img src>`` against ``url``.

        Args:
            htmlString: HTML markup to process.
            url: Base URL passed to ``urllib.parse.urljoin`` for each
                ``href``/``src`` value.
            dic: Optional mapping of element id -> content. Keys and
                values are inserted verbatim (no HTML escaping);
                non-string values are formatted as text. ``None`` (the
                default) means no extra elements.

        Returns:
            The markup produced by the final document's ``html()`` call.
        """
        # A ``None`` default avoids sharing one dict object across calls.
        entries = {} if dic is None else dic

        psoup = PSoup()
        document = psoup.getPSoup(htmlString)
        bodyElement = document.find("body")

        # Build the markup for the dictionary entries in one pass.
        injected = "".join(
            "<div id='{}'>{}</div>".format(key, value)
            for key, value in entries.items()
        )

        if bodyElement is None:
            # No <body> found: wrap the fragment so there is one to append to.
            htmlString = "<body>" + htmlString + "</body>"
            document = psoup.getPSoup(htmlString)
            bodyElement = document.find("body")
        bodyElement.append(injected)
        htmlString = document.html()

        # Replace the paths of <a> and <img> tags with absolute URLs.
        htmlString = self._resolveAttribute(psoup, htmlString, url, "a", "href")
        return self._resolveAttribute(psoup, htmlString, url, "img", "src")

    def _resolveAttribute(self, psoup, htmlString, url, tagName, attrName):
        """Re-parse the markup and join each ``tagName`` element's ``attrName`` with ``url``.

        Elements whose attribute is ``None`` (absent) are left untouched.
        Returns the document's ``html()`` output after the rewrite.
        """
        document = psoup.getPSoup(htmlString)
        for element in document.find(tagName).items():
            current = element.attr(attrName)
            if current is not None:
                element.attr(attrName, urljoin(url, current))
        return document.html()