简单爬虫

《网络搜索引擎》课上老师举的一个用 Python 写的简单爬虫例子：没有使用正则表达式，而是用 XPath 来提取网页中的内容。

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#encoding=utf-8
from lxml import html
# XPath expressions worked out by inspecting the target page's markup.
# Selects the href attribute of the <a> element whose id is 'key_nextpage'
# (the link to the next result page).
next_button_xpath = "//a[@id='key_nextpage']/@href"
# Selects the text of every <a> found under div[@class='picbox']/dl/dt;
# each text node is treated as one headline/title.
headline_xpath = "//div[@class='picbox']/dl/dt/a/text()"

# We'll use sleep to add a pause between requests so that we're not hitting the mtime.com server too hard.
from time import sleep

# Collect movie titles from mtime.com, starting at the landing page and
# following each page's "next page" link until enough titles are gathered
# or a page offers no further link.
MAX_TITLES = 50            # stop paging once at least this many titles are held
REQUEST_DELAY_SECONDS = 3  # pause between consecutive page requests

titles = []
# NOTE(review): assumes the next-page href is a fragment that belongs directly
# after ".../hotest/" -- confirm against the live markup.
base_url = 'http://www.mtime.com/hotest/{}'
next_page = "http://www.mtime.com/hotest/"
while len(titles) < MAX_TITLES and next_page:
    dom = html.parse(next_page)
    headlines = dom.xpath(headline_xpath)
    # A parenthesised single-argument print produces the same output on
    # Python 2 and Python 3.
    print("Retrieved {} titles from url: {}".format(len(headlines), next_page))
    # A whole page is appended at once, so the final list may hold more than
    # MAX_TITLES entries.
    titles += headlines
    next_pages = dom.xpath(next_button_xpath)
    if next_pages:
        next_page = base_url.format(next_pages[0])
    else:
        print("No next button found")
        next_page = None
    # Throttle only when another request will actually follow; sleeping after
    # the final page would merely delay the program.
    if next_page and len(titles) < MAX_TITLES:
        sleep(REQUEST_DELAY_SECONDS)

# Persist the collected titles as UTF-8, one title per line.
with open('mtime_titles.txt', 'wb') as out:
    out.write('\n'.join(titles).encode('utf-8'))

# Read the file back in binary mode purely to count its lines: nothing is
# decoded, so the count does not depend on the platform's default text
# encoding.
with open('mtime_titles.txt', 'rb') as f:
    titles_ = f.readlines()

print("Well, we got {} Hot Movies!".format(len(titles_)))
# Show a sample: at most the first 15 collected titles.
for title in titles[:15]:
    print(title)