Scraping the Maoyan Top 100

Learning goal: understand how a web scraper works, in three parts (fetch, parse, save).


1. Fetch the page's HTML source
import requests
from requests.exceptions import RequestException


def get_one_page(url):
    # Fetch the page and return its HTML text, or None on a bad status or request error
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def main():
    url = 'http://maoyan.com/board/4?'
    html = get_one_page(url)
    print(html)


if __name__ == '__main__':
    main()

Result

"C:\Program Files\Python36-32\python.exe" C:/Users/15581/PycharmProjects/maoyantop100/spider.py
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width">
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<link rel="shortcut icon" href="http://ms0.meituan.com/braavos/static/logo.png">
<title>猫眼访问控制</title>
<style>
*{
margin: 0;
padding: 0;
box-sizing: border-box;
}
html, body{
text-align: center;
width: 100%;
max-width: 375px;
margin: auto;
}
header h3{
margin: 20px;
font-family: monospace;
}
footer{
margin-top: 20px;
text-align: center;
}

footer a {
font-size: 14px;
}

@media screen and (max-width: 768px){
body{
text-align: left !important;
}
main{
width: 95%;
}
}

</style>
</head>
<body>
<header>
<!-- <h1>MaoYan Access Control System</h1> -->
<h3>
<p>很抱歉,您的访问被禁止了</p>
<p>如果您认为我们出错了,请联系我们 <a href="mailto:oceanus.feedback@maoyan.com">oceanus.feedback@maoyan.com</a></p>
</h3>
</header>
<footer>
<a href="https://maoyan.com">猫眼电影</a>
</footer>
</body>
</html>


Process finished with exit code 0
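Note that what came back is not the Top 100 leaderboard at all: it is Maoyan's access-control page ("很抱歉,您的访问被禁止了" means "sorry, your access has been forbidden"), i.e. the site blocked the bare request. A common workaround is to make the request look like it comes from a browser by sending a User-Agent header. The sketch below assumes the block keys off the default requests User-Agent; the header value is just an example, any ordinary browser UA string works:

import requests
from requests.exceptions import RequestException

# Any recent browser User-Agent string works here; this one is only an example
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}

def get_one_page(url):
    try:
        # Same as above, but with a browser-like header attached to the request
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

Whether this is enough depends on Maoyan's anti-scraping rules at the time; it only changes how the request identifies itself.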

2. Extract the useful parts with a regular expression

import requests
import re
from requests.exceptions import RequestException


def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


# The regular expression is written around the fields we need;
# .*? lazily matches whatever sits between one target field and the next
def parse_one_page(html):
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)  # re.S lets . match newlines too
    items = re.findall(pattern, html)
    print(items)


def main():
    url = 'http://maoyan.com/board/4?'
    html = get_one_page(url)
    parse_one_page(html)


if __name__ == '__main__':
    main()

Result

"C:\Program Files\Python36-32\python.exe" C:/Users/15581/PycharmProjects/maoyantop100/choose.py
[]

Process finished with exit code 0
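The empty list is simply what re.findall returns when the pattern matches nothing, which is expected here because the HTML we got back is the access-control page and contains no <dd> entries. To see what the pattern captures when it does match, here is a small sketch run against a hand-written <dd> fragment modeled on the leaderboard markup (the fragment and its values are made up for illustration):

import re

pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                     + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                     + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)

sample = '''<dd>
    <i class="board-index board-index-1">1</i>
    <img data-src="http://example.com/poster.jpg">
    <p class="name"><a href="/films/1203">霸王别姬</a></p>
    <p class="star">主演:张国荣,张丰毅,巩俐</p>
    <p class="releasetime">上映时间:1993-01-01</p>
    <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
</dd>'''

print(re.findall(pattern, sample))
# [('1', 'http://example.com/poster.jpg', '霸王别姬', '主演:张国荣,张丰毅,巩俐',
#   '上映时间:1993-01-01', '9.', '6')]

Each pair of parentheses in the pattern becomes one element of the tuple; .*? combined with re.S lazily skips over everything (newlines included) between the fields we want.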

3. Turn the extracted matches into dictionaries and write them to a file as text

import json
import requests
import re
from requests.exceptions import RequestException


def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        # Turn each tuple of captured groups into a dictionary
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the "主演:" prefix
            'time': item[4].strip()[5:],    # drop the "上映时间:" prefix
            'score': item[5] + item[6]      # integer part + fractional part
        }


def write_to_file(content):
    # Append one movie per line as JSON; ensure_ascii=False keeps Chinese text readable
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main():
    url = 'http://maoyan.com/board/4?'
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    main()
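write_to_file appends each movie as one JSON object per line, so result.txt ends up in a simple line-per-record format. A minimal sketch of reading the file back, assuming it was produced by the script above:

import json

with open('result.txt', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f if line.strip()]

print(len(movies))           # number of records collected so far
print(movies[0]['title'])    # title of the first record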

4. Use a loop to crawl multiple pages

import json
import requests
import re
from multiprocessing import Pool
from requests.exceptions import RequestException


def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],
            'time': item[4].strip()[5:],
            'score': item[5] + item[6]
        }


def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    # Each page of the leaderboard holds 10 movies, selected by the offset parameter
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    for i in range(10):
        main(i * 10)
    ## multiprocessing version:
    ## pool = Pool()
    ## pool.map(main, [i * 10 for i in range(10)])
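
The commented-out lines at the bottom hint at a parallel version: Pool.map hands each offset (0, 10, ..., 90) to a worker process, so the ten pages are fetched concurrently instead of one after another. A sketch of that variant, assuming main is the function defined above (with concurrent writers, the order of lines in result.txt is no longer guaranteed):

from multiprocessing import Pool

if __name__ == '__main__':
    pool = Pool()                                 # defaults to one worker per CPU core
    pool.map(main, [i * 10 for i in range(10)])   # offsets 0, 10, ..., 90
    pool.close()
    pool.join()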