1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
"""Collect the headline list of every daily page on mrxwlb.com into a workbook.

For each calendar day of the configured years the script requests the page
``<BASE_URL><year>年<month>月<day>日<URL_SUFFIX>``, reads every ``<li>`` inside
the ``div`` with class ``cntn-wrp artl-cnt`` and appends one
``[url, title]`` row per item to an Excel sheet, which is finally written to
``OUTPUT_PATH``.
"""
import calendar

import openpyxl
import requests
from bs4 import BeautifulSoup

# Years to crawl (2016 through 2021 inclusive).
YEARS = range(2016, 2022)

# Pieces of the per-day page address; the date is inserted between them.
BASE_URL = "http://mrxwlb.com/"
URL_SUFFIX = "新闻联播文字版/amp/"

# Workbook layout and destination.
SHEET_TITLE = '新闻采集(2016年-2021年)'
OUTPUT_PATH = 'D:\\新闻采集(2016年-2021年).xlsx'

# Seconds to wait for the server before giving up on a single page, so one
# stalled connection cannot hang the whole multi-year crawl.
REQUEST_TIMEOUT = 30


def iter_dates(years=YEARS):
    """Yield ``(year, month, day)`` for every real calendar day in *years*.

    Month lengths come from ``calendar.monthrange`` rather than hand-written
    day tables, so 29 February of leap years (2016, 2020) is included and a
    month can never be skipped because of a lookup-table mismatch.
    """
    for year in years:
        for month in range(1, 13):
            days_in_month = calendar.monthrange(year, month)[1]
            for day in range(1, days_in_month + 1):
                yield year, month, day


def build_url(year, month, day):
    """Return the page address for one day (numbers are not zero-padded)."""
    return (BASE_URL + str(year) + '年' + str(month) + "月"
            + str(day) + "日" + URL_SUFFIX)


def fetch_titles(url, timeout=REQUEST_TIMEOUT):
    """Download *url* and return the text of each list item on the page.

    Returns:
        A list of strings, one per ``<li>`` found inside the
        ``div.cntn-wrp.artl-cnt`` container, or ``None`` when the page has no
        such container.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    container = soup.find("div", class_="cntn-wrp artl-cnt")
    if container is None:
        return None
    return [item.text for item in container.find_all("li")]


def _warn(url, detail=None):
    """Print the warning banner for a page that yielded no rows."""
    print("WARNING!!!WARNING!!!WARNING!!!")
    print(url)
    if detail is not None:
        print(detail)
    print("WARNING!!!WARNING!!!WARNING!!!")


def main():
    """Crawl every day in ``YEARS`` and save the collected rows to Excel."""
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = SHEET_TITLE
    sheet['A1'] = '网址'
    sheet['B1'] = '标题'

    try:
        for year, month, day in iter_dates():
            url = build_url(year, month, day)
            try:
                titles = fetch_titles(url)
            except requests.RequestException as exc:
                # A single unreachable page must not abort the whole crawl.
                _warn(url, exc)
                continue
            if titles is None:
                _warn(url)
                continue
            for title in titles:
                sheet.append([url, title])
    finally:
        # Save whatever was gathered even if the run is interrupted, so hours
        # of downloading are not lost to one late failure.
        workbook.save(OUTPUT_PATH)


if __name__ == "__main__":
    main()
|