Learning goal: understand how a crawler works, in three parts (fetch the page, extract the data, save the results).
1. Fetch the page's HTML source
import requests
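Only the import line of this first script is shown above; the following is a minimal reconstruction of the fetch step, assuming the same structure as the get_one_page helper used in the later steps (a sketch, not the author's exact spider.py):

import requests

def get_one_page(url):
    # Return the page's HTML on HTTP 200, otherwise None.
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None

def main():
    url = 'http://maoyan.com/board/4'
    html = get_one_page(url)
    print(html)

if __name__ == '__main__':
    main()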
Result:
61"C:\Program Files\Python36-32\python.exe" C:/Users/15581/PycharmProjects/maoyantop100/spider.py
<html>
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width">
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<link rel="shortcut icon" href="http://ms0.meituan.com/braavos/static/logo.png">
<title>猫眼访问控制</title>
<style>
*{
  margin: 0;
  padding: 0;
  box-sizing: border-box;
}
html, body{
  text-align: center;
  width: 100%;
  max-width: 375px;
  margin: auto;
}
header h3{
  margin: 20px;
  font-family: monospace;
}
footer{
  margin-top: 20px;
  text-align: center;
}
footer a {
  font-size: 14px;
}
@media screen and (max-width: 768px){
  body{
    text-align: left !important;
  }
  main{
    width: 95%;
  }
}
</style>
</head>
<body>
  <header>
    <!-- <h1>MaoYan Access Control System</h1> -->
    <h3>
      <p>很抱歉,您的访问被禁止了</p>
      <p>如果您认为我们出错了,请联系我们 <a href="mailto:oceanus.feedback@maoyan.com">oceanus.feedback@maoyan.com</a></p>
    </h3>
  </header>
  <footer>
    <a href="https://maoyan.com">猫眼电影</a>
  </footer>
</body>
</html>
Process finished with exit code 0
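Note that the page returned above is Maoyan's access-control page ("sorry, your access has been forbidden"), not the Top 100 board: the site appears to block bare requests. A common workaround, not part of the original notes, is to send a browser-like User-Agent header:

import requests

# The User-Agent string below is an illustrative example; any mainstream
# browser UA string can be substituted.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
response = requests.get('http://maoyan.com/board/4', headers=headers)
print(response.status_code)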
2. Filter out the useful content with a regular expression
import requests
import re
from requests.exceptions import RequestException

def get_one_page(url):
    # Fetch the page; return its HTML on HTTP 200, otherwise None.
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    # Regular expression tailored to the board's <dd> markup:
    # .*? lazily skips the markup between the fields we capture,
    # and re.S lets . match newline characters as well.
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    print(items)

def main():
    url = 'http://maoyan.com/board/4?'
    html = get_one_page(url)
    parse_one_page(html)

if __name__ == '__main__':
    main()
Result:
"C:\Program Files\Python36-32\python.exe" C:/Users/15581/PycharmProjects/maoyantop100/choose.py
[]
Process finished with exit code 0
The empty list is expected here: the response is still the access-control page shown in step 1, which contains no <dd> movie entries for the pattern to match.
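To check the pattern independently of the blocked page, it can be run against a hand-written fragment in the board's markup style (the fragment below is illustrative, not copied from Maoyan):

import re

sample = '''<dd>
  <i class="board-index board-index-1">1</i>
  <img data-src="http://example.com/poster.jpg">
  <p class="name"><a href="/films/1203">霸王别姬</a></p>
  <p class="star">主演:张国荣,张丰毅,巩俐</p>
  <p class="releasetime">上映时间:1993-01-01</p>
  <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
</dd>'''

pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                     + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                     + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
print(re.findall(pattern, sample))
# [('1', 'http://example.com/poster.jpg', '霸王别姬',
#   '主演:张国荣,张丰毅,巩俐', '上映时间:1993-01-01', '9.', '6')]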
3. Turn the matches into dictionaries and write them to a file as text
import json
import requests
import re
from requests.exceptions import RequestException

def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        # Each regex tuple becomes a dictionary with named fields.
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the "主演:" prefix
            'time': item[4].strip()[5:],    # drop the "上映时间:" prefix
            'score': item[5] + item[6]      # integer part + fraction part
        }

def write_to_file(content):
    # Append one JSON object per line; the with block closes the file,
    # so no explicit close() is needed.
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main():
    url = 'http://maoyan.com/board/4?'
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    main()
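Since write_to_file stores one JSON object per line, the data can be read back symmetrically (a small usage sketch, assuming result.txt was produced by a successful run):

import json

with open('result.txt', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f]
print(movies[0]['title'], movies[0]['score'])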
4. Use a loop to crawl multiple pages
import json
import requests
import re
from multiprocessing import Pool
from requests.exceptions import RequestException

def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the "主演:" prefix
            'time': item[4].strip()[5:],    # drop the "上映时间:" prefix
            'score': item[5] + item[6]      # integer part + fraction part
        }

def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    # The board shows 10 movies per page; offset steps through the 10 pages.
    for i in range(10):
        main(i * 10)
    # Multiprocessing alternative:
    # pool = Pool()
    # pool.map(main, [i * 10 for i in range(10)])
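One caveat on the commented-out Pool variant: map hands the ten offsets to parallel worker processes, so lines appended to result.txt can arrive out of ranking order. A sketch of the multiprocessing version with an explicit shutdown (the close/join calls are an addition; the original only shows pool.map):

from multiprocessing import Pool

if __name__ == '__main__':
    pool = Pool()                                # one worker per CPU core by default
    pool.map(main, [i * 10 for i in range(10)])  # fetch all 10 pages in parallel
    pool.close()                                 # no more tasks will be submitted
    pool.join()                                  # wait for all workers to finish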