#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @File    : Spider
# @Author  : moucong
# @Date    : 2018/12/25 16:36
# @Software: PyCharm
import os
import re
import string
import time
from urllib import request
from urllib.parse import quote, urljoin

import docx
from bs4 import BeautifulSoup
from docx.oxml.ns import qn
from docx.shared import Inches
def _clean_text(text):
    """Return *text* without carriage returns, newlines or tabs, stripped."""
    # The control characters themselves are removed (not the two-character
    # sequences backslash + letter), since that is what parsed HTML contains.
    return re.sub(r'[\r\n\t]', '', text).strip()


def _download_image(src, base_url, pic_dir, index):
    """Fetch the image referenced by *src* and return the local file path.

    *src* is resolved against *base_url*, so site-relative, page-relative and
    absolute ``src`` values all work. The file is stored as ``<index>.jpg``
    inside *pic_dir*.
    """
    # ``string.punctuation`` keeps URL delimiters and existing %-escapes
    # intact while spaces and non-ASCII characters get percent-encoded.
    pic_url = quote(urljoin(base_url, src), safe=string.punctuation)
    pic_path = os.path.join(pic_dir, "%s.jpg" % index)
    request.urlretrieve(pic_url, pic_path)
    return pic_path


def spider(url="http://www.semi.org.cn/news/news_show.aspx?ID=54725&classid=128",
           pic_dir="E:/SEMI_job/SEMI_Spider/pic/",
           doc_path=r"E:\SEMI_job\SEMI_Spider\writeResult.docx",
           content_selector=None):
    """Scrape one news page and write its paragraphs and images to a .docx.

    The page is downloaded, parsed with BeautifulSoup, and every ``<p>``
    element is converted in document order:

    * a paragraph containing ``<img>`` tags -> each image is downloaded into
      *pic_dir* and inserted 3 inches wide;
    * a paragraph containing ``<strong>`` -> one bold paragraph made of the
      ``<strong>`` text only;
    * anything else -> a plain paragraph with the paragraph's text.

    Args:
        url: Page to scrape. Image ``src`` values are resolved against it.
        pic_dir: Directory for downloaded images (created if missing).
        doc_path: Destination of the generated Word document.
        content_selector: Optional CSS selector of the article container.
            When omitted, every ``<p>`` of the page is processed.

    Raises:
        ValueError: If *content_selector* is given but matches nothing.
        urllib.error.URLError: If the page or an image cannot be fetched.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with request.urlopen(url) as response:
        page = response.read().decode('utf-8')
    soup = BeautifulSoup(page, "lxml")

    # NOTE(review): the original regex / split delimiters that isolated the
    # article body are not recoverable (the split separator was empty, which
    # raises ValueError). Pass ``content_selector`` once the real container
    # is confirmed; without it page chrome paragraphs may be included.
    root = soup.select_one(content_selector) if content_selector else soup
    if root is None:
        raise ValueError("content_selector matched nothing: %r" % content_selector)

    os.makedirs(pic_dir, exist_ok=True)
    doc_dir = os.path.dirname(doc_path)
    if doc_dir:
        os.makedirs(doc_dir, exist_ok=True)

    document = docx.Document()
    # Word font setup: the East Asian font has to be set on the style's XML
    # element as well, otherwise CJK text keeps the default font.
    normal_style = document.styles['Normal']
    normal_style.font.name = u'宋体'
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

    # Counter lives outside the loop so each image gets its own file name.
    pic_count = 0
    for block in root.find_all("p"):
        images = block.find_all("img", src=True)
        if images:
            for image in images:
                pic_path = _download_image(image["src"], url, pic_dir, pic_count)
                pic_count += 1
                document.add_picture(pic_path, width=Inches(3.0))
            continue

        strong_tags = block.find_all("strong")
        if strong_tags:
            # add_run() needs a string, so the <strong> fragments are joined.
            bold_text = _clean_text("".join(tag.get_text() for tag in strong_tags))
            if bold_text:
                run = document.add_paragraph().add_run(bold_text)
                run.font.bold = True
            continue

        plain_text = _clean_text(block.get_text())
        if plain_text:
            document.add_paragraph(plain_text)

    document.save(doc_path)
    print("已处理好!")