最近闲着没事刚好又要写人文讲座报告,然而已经不记得当初听过的讲座有哪些了,只能用小猴偷米查到刷卡的时间,刚好想到每次讲座信息校网上都会发一次通知,就顺手用beautifulsoup写了个爬虫爬讲座信息。
# -*- coding: utf-8 -*-
import os, sys, time, platform, random
import re, json, cookielib
# requirements
import requests, termcolor, html2text
try:
from bs4 import BeautifulSoup
except:
import BeautifulSoup
class Content_Page:
url = None
soup = None
flags = "主办:东南大学文化素质教育中心"
info_file = open("culture_info.txt","a")
def __init__(self,_url):
self.url = _url
r = requests.get(self.url)
self.soup = BeautifulSoup(r.content, "lxml")
def get_content(self):
r = requests.get(self.url)
self.soup = BeautifulSoup(r.content, "lxml")
#print soup
def get_culture_content(self):
if self.soup == None:
self.get_content()
content = self.soup.find_all("p")
if content == None:
return
else:
self.soup.find("span",class_="Article_PublishDate").get_text()
self.info_file.write("--------------------------"+self.soup.find("span",class_="Article_PublishDate").get_text()+"--------------------------------------\n")
#print self.soup.find("span",class_="Article_PublishDate").get_text()
for p in content:
p_content = p.get_text().encode("utf-8")
#print p_content.decode('utf-8').encode("GB18030")
self.info_file.write(p_content+"\n")
def check_culture_content(self):
if self.soup == None:
self.get_content()
content = self.soup.find_all("p")
if content == None:
return False
else:
for p in content:
p_content = p.get_text().encode("utf-8")
if re.match(self.flags, p_content) != None:
return True
class Page:
url = None
soup = None
base_url = "http://www.seu.edu.cn/"
def __init__(self,_url):
self.url = _url
r = requests.get(self.url)
soup = BeautifulSoup(r.content, "lxml")
def get_content(self):
r = requests.get(self.url)
self.soup = BeautifulSoup(r.content, "lxml")
#print soup
def get_culture(self):
if self.soup == None:
self.get_content()
else:
for td in self.soup.find_all("td"):
if td.a != None:
content_page = Content_Page(self.base_url+td.a["href"])
if content_page.check_culture_content():
content_page.get_culture_content()
next_page = self.soup.find("a",class_="next")
self.url = self.base_url + next_page["href"]
self.get_content()
print self.url
if next_page["href"] != "javascript:void(0);":
self.get_culture()
page = Page("http://www.seu.edu.cn/138/list1.htm")
page.get_content()
page.get_culture()
因为是随便写的所以代码很丑,不过也并不想管那么多了,拷下来直接运行就可以得到一个culture_info.txt,然后在txt里搜索你的日期就可以看到你那天听的是哪个讲座了。
如果你懒得爬的话,这里有一份截止到16年4月份的讲座记录,可以直接下载