Web Crawler

Dependencies: Python 3 with BeautifulSoup
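BeautifulSoup lives in the bs4 package; if it is not already installed, pip install beautifulsoup4 pulls it in. The urllib modules used below are part of the Python 3 standard library.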

Main file: spider_main.py


# -*- coding:utf-8 -*-
# Main program: holds the root URL and drives the crawler
import html_downloder
import html_output
import html_parser
import url_manager


class SpilderMain:
    def __init__(self):
        # Initialize the objects that craw() needs: the URL manager,
        # the page downloader, the page parser and the outputer
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloder.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_output.HtmlOutputer()

    def craw(self, url):
        count = 1  # URL counter
        # Seed the manager with the root URL
        self.urls.add_new_url(url)
        # Main crawl loop
        while self.urls.has_new_url():
            try:
                # Take the next URL to crawl
                new_url = self.urls.get_new_url()
                print("URL %d: %s" % (count, new_url))
                # Download the page behind that URL
                html_cont = self.downloader.download(new_url)
                # Parse the downloaded page into new links and page data
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                # Feed the newly found URLs back into the URL manager
                self.urls.add_new_urls(new_urls)
                # Collect the parsed data for the final report
                self.outputer.collect_data(new_data)

                if count == 15:
                    break
                count += 1
            except Exception:
                print("Failed to crawl this URL")

        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpilderMain()
    # Start the crawler
    obj_spider.craw(root_url)
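Running python spider_main.py starts from the Baidu Baike entry for Python, crawls at most 15 pages (the count == 15 check above), and finally writes the collected titles and summaries to output.html.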

Page downloader: html_downloder.py


# -*- coding:utf-8 -*-
# Page downloader
import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        # Anything other than HTTP 200 is treated as a failed download
        if response.getcode() != 200:
            return None
        return response.read()
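One practical note: some sites reject requests carrying urllib's default User-Agent, and baike.baidu.com may be among them. If download() keeps returning nothing, a variant that sends a browser-like header can help. This is only a sketch layered on the original class, and the header string is a placeholder, not something taken from the article:

# -*- coding:utf-8 -*-
# Sketch: downloader variant that sends a browser-like User-Agent (assumption, not the original code)
import urllib.request


class HtmlDownloaderWithHeaders(object):
    def download(self, url):
        if url is None:
            return None
        # Wrap the URL in a Request object so a custom header can be attached
        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        response = urllib.request.urlopen(request)
        if response.getcode() != 200:
            return None
        return response.read()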

Page parser: html_parser.py


# -*- coding:utf-8 -*-
# Page parser
import re
from bs4 import BeautifulSoup
import urllib.parse


class HtmlParser(object):
    # Parse the downloaded content html_cont of page_url
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    # Collect all entry URLs found on the page
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        # From inspecting the site, entry links look like: /view/12334.htm
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            # The href is relative, so join it with the base http://baike.baidu.com
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    # Extract the data of one page: its title and its summary
    def _get_new_data(self, page_url, soup):
        # Keep the extracted fields in a dict
        res_data = {}
        # Save the page URL
        res_data['url'] = page_url
        # The title markup looks like:
        # <dd class="lemmaWgt-lemmaTitle-title"> <h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1')
        res_data['title'] = title_node.get_text()
        # Now extract the summary text
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data
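To see what the parser returns without going through the downloader, it can be run on a small hand-written snippet that mimics the two markup patterns above. The snippet and the expected values in the comments are illustrative only:

# -*- coding:utf-8 -*-
# Offline check of HtmlParser on a hand-written snippet (illustrative only)
import html_parser

SAMPLE_HTML = b"""
<html><body>
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is a programming language.</div>
<a href="/view/12334.htm">another entry</a>
</body></html>
"""

parser = html_parser.HtmlParser()
urls, data = parser.parse("http://baike.baidu.com/view/21087.htm", SAMPLE_HTML)
print(urls)   # expected: {'http://baike.baidu.com/view/12334.htm'}
print(data)   # expected: dict with 'url', 'title' == 'Python', and the summary text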

URL manager: url_manager.py


# -*- coding:utf-8 -*-
# URL manager


class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    # Add a single URL
    def add_new_url(self, url):
        if url is None:
            return
        # Only add the URL if it is in neither the new set nor the old (crawled) set
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    # Is there still a URL waiting to be crawled?
    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
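A quick check of the manager's de-duplication behavior, using placeholder URLs:

# -*- coding:utf-8 -*-
# Minimal check of UrlManager de-duplication (placeholder URLs)
import url_manager

manager = url_manager.UrlManager()
manager.add_new_url("http://baike.baidu.com/view/21087.htm")
manager.add_new_urls({"http://baike.baidu.com/view/21087.htm",   # duplicate, ignored
                      "http://baike.baidu.com/view/12334.htm"})

while manager.has_new_url():
    print(manager.get_new_url())   # each URL is handed out exactly once

# Re-adding a crawled URL has no effect because it is now in old_urls
manager.add_new_url("http://baike.baidu.com/view/21087.htm")
print(manager.has_new_url())       # False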

Result outputer: html_output.py



# -*- coding:utf-8 -*-
# Final result output
# Provides two functions: collecting the data and writing it out


class HtmlOutputer(object):
    # The collected data is kept in a list
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    # Write the collected data out as an HTML table
    def output_html(self):
        fileout = open("output.html", "w", encoding='utf-8')
        fileout.write("<html>")
        fileout.write("<head>")
        fileout.write("<meta charset='utf-8'>")
        fileout.write("</head>")
        fileout.write("<body>")
        fileout.write("<table>")
        for data in self.datas:
            fileout.write("<tr>")
            fileout.write("<td>%s</td>" % data['url'])
            fileout.write("<td>%s</td>" % data['title'])
            fileout.write("<td>%s</td>" % data['summary'])
            fileout.write("</tr>")
        fileout.write("</table>")
        fileout.write("</body>")
        fileout.write("</html>")
        fileout.close()
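One small robustness tweak that is not in the original code: opening the file in a with block guarantees it is closed even if a write fails halfway through. A sketch of the same logic as a standalone helper (write_output is a hypothetical name, not part of the article's modules):

# -*- coding:utf-8 -*-
# Sketch: writing the same table with a with block (assumption, not the original code)


def write_output(datas, path="output.html"):
    # The with block closes the file even if one of the writes raises
    with open(path, "w", encoding='utf-8') as fileout:
        fileout.write("<html><head><meta charset='utf-8'></head><body><table>")
        for data in datas:
            fileout.write("<tr><td>%s</td><td>%s</td><td>%s</td></tr>"
                          % (data['url'], data['title'], data['summary']))
        fileout.write("</table></body></html>")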