Scrape the web
speedily, reliably, and simply with scrapy
Asheesh Laroia
(thanks)
>>> # get a web page
>>> page = urllib2.urlopen('http://oscon.com/').read()
>>> # parse it
>>> soup = BeautifulSoup.BeautifulSoup(page)
>>> # find element we want
>>> matches = soup('div', {'id': 'location_place'})
Finish extraction and save:
>>> # pull out text
>>> first = matches[0]
>>> date_range = first.find(text=True)
>>> print date_range
u'July 22-26, 2013'
>>> # store results somehow
>>> save_results({'conference': 'oscon', 'date_range': date_range})
>>> # get a web page
>>> page = urllib2.urlopen('http://oscon.com/').read()
This blocks until the remote site responds.
Must test online.
If this fails, the app crashes.
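One way to keep a transient network failure from crashing the whole app is to catch URLError around the call; a minimal sketch (what you do in the except branch -- retry, skip, log -- is your call, not something the talk prescribes):
>>> import urllib2
>>> try:
...     page = urllib2.urlopen('http://oscon.com/').read()
... except urllib2.URLError:
...     page = None  # retry, skip, or log -- just don't crash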
>>> # pull out text
>>> first = matches[0]
If this fails, the app crashes.
>>> # find element we want
>>> matches = soup('div', {'id': 'location_place'})
That's just a CSS selector!
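For comparison, the same lookup written as a real CSS selector, using lxml and cssselect (a sketch; page is the HTML string fetched above):
>>> import lxml.html
>>> doc = lxml.html.fromstring(page)
>>> matches = doc.cssselect('div#location_place')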
>>> # store results somehow
>>> save_results({'conference': 'oscon', 'date_range': date_range})
No clarity about data format. Code evolves!
Task: Get a list of speakers
CSS and XPath
>>> import cssselect
>>> cssselect.HTMLTranslator().css_to_xpath('span.speaker')
u"descendant-or-self::span[@class and contains(concat(' ', normalize-space(@class), ' '), ' speaker ')]"
SCHED_PAGE='https://us.pycon.org/2013/schedule/'
import requests
import lxml.html
def main():
data = requests.get(SCHED_PAGE)
parsed = lxml.html.fromstring(data.content)
for speaker in parsed.cssselect('span.speaker'):
print speaker.text_content()
Why: Separate handling from retrieving
SCHED_PAGE='https://us.pycon.org/2013/schedule/'
import requests
import lxml.html
def main():
data = requests.get(SCHED_PAGE)
parsed = lxml.html.fromstring(data.content)
for speaker in parsed.cssselect('span.speaker'):
print speaker.text_content()
# ↑
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 0: ordinal not in range(128)
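The crash comes from printing a non-ASCII speaker name to an ASCII-configured stdout; one workaround is to encode explicitly before printing (UTF-8 here is an assumption about your terminal, not part of the talk's code):
    for speaker in parsed.cssselect('span.speaker'):
        print speaker.text_content().encode('utf-8')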
How: Separate handling from retrieving
SCHED_PAGE='https://us.pycon.org/2013/schedule/'
import requests
import lxml.html
def get_data():
    response = requests.get(SCHED_PAGE)
    parsed = lxml.html.fromstring(response.content)
    data = []
    for speaker in parsed.cssselect('span.speaker'):
        data.append(speaker.text_content())
    return data
Why: Clarify the fields you are retrieving
SCHED_PAGE='https://us.pycon.org/2013/schedule/'
import requests
import lxml.html

def get_data():
    response = requests.get(SCHED_PAGE)
    parsed = lxml.html.fromstring(response.content)
    data = []
    for speaker in parsed.cssselect('span.speaker'):
        datum = {}
        datum['speaker_name'] = speaker.text_content()
        datum['preso_title'] = _ # FIXME
        data.append(datum)
    return data # ↑

def handle_datum(datum):
print datum['title'], 'by', datum['speaker_name']
# ↑
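With retrieval and handling split, the only glue left is a tiny driver; a sketch (this main() is illustrative, not from the talk). It also shows why the field names matter: get_data emits 'preso_title' while handle_datum reads 'title', so this loop raises KeyError until the two sides agree.
def main():
    for datum in get_data():   # retrieving
        handle_datum(datum)    # handling -- KeyError('title') until the keys match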
class PyConPreso(scrapy.item.Item):
    author = Field()
    preso = Field()

# Similar to...
{'author': _,
 'preso': _}

>>> p = PyConPreso()
>>> p['title'] = 'Asheesh'
KeyError: 'PyConPreso does not support field: title'
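Declared fields otherwise behave like dict keys, so correct usage looks like this (the values are made up for illustration):
>>> p = PyConPreso(author='Asheesh Laroia', preso='Scrape the web')
>>> p['author']
'Asheesh Laroia'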
def get_data():
    response = requests.get(SCHED_PAGE)
    parsed = lxml.html.fromstring(response.content)
    out_data = []
    for speaker in parsed.cssselect('span.speaker'):
        author = _ # ...
        preso_title = _ # ...
        item = PyConPreso(
            author=author,
            preso=preso_title)
        out_data.append(item)
    return out_data
import lxml.html

START_URL = '...'

class PyConSiteSpider(BaseSpider):
    start_urls = [START_URL]

    def parse(self, response):
        parsed = lxml.html.fromstring(
            response.body_as_unicode())
        speakers = parsed.cssselect('span.speaker')
        results = []
        for speaker in speakers:
            author = _ # placeholder
            preso = _ # placeholder
            results.append(PyConPreso(
                author=author, preso=preso))
        return results
import lxml.html

START_URL = '...'

class PyConSiteSpider(BaseSpider):
    start_urls = [START_URL]

    def parse(self, response):
        parsed = lxml.html.fromstring(
            response.body_as_unicode())
        speakers = parsed.cssselect('span.speaker')
        for speaker in speakers:
            author = _ # placeholder
            preso = _ # placeholder
            yield PyConPreso(
                author=author, preso=preso)
$ scrapy runspider your_spider.py
2013-03-12 18:04:07-0700 [Demo] DEBUG: Crawled (200) <GET ...> (referer: None)
2013-03-12 18:04:07-0700 [Demo] DEBUG: Scraped from <200 ...>
{}
2013-03-12 18:04:07-0700 [Demo] INFO: Closing spider (finished)
2013-03-12 18:04:07-0700 [Demo] INFO: Dumping spider stats:
{'downloader/request_bytes': 513,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 75142,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2013, 3, 13, 1, 4, 7, 567078),
'item_scraped_count': 1,
'scheduler/memory_enqueued': 2,
'start_time': datetime.datetime(2013, 3, 13, 1, 4, 5, 144944)}
2013-03-12 18:04:07-0700 [Demo] INFO: Spider closed (finished)
2013-03-12 18:04:07-0700 [scrapy] INFO: Dumping global stats:
{'memusage/max': 95105024, 'memusage/startup': 95105024}
$ scrapy runspider your_spider.py -L ERROR
$
$ scrapy runspider your_spider.py -s FEED_URI=myfile.out
$
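The switches compose, so a quiet run that writes structured output is one line; a sketch (FEED_FORMAT is a standard Scrapy feed-export setting; the filename is arbitrary):
$ scrapy runspider your_spider.py -L ERROR -s FEED_URI=items.json -s FEED_FORMAT=json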
>>> 'Pablo Hoffman' > 'Asheesh Laroia'
True
$ scrapy startproject tutorial
creates
tutorial/
scrapy.cfg
tutorial/
__init__.py
items.py
pipelines.py
settings.py
spiders/
__init__.py
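Inside the project, spiders go in the spiders/ package and get a name so scrapy crawl can find them; a sketch (the file name and spider name are made up):
# tutorial/spiders/pycon_spider.py  (hypothetical)
from scrapy.spider import BaseSpider

class PyConSiteSpider(BaseSpider):
    name = 'pycon'   # referenced by: scrapy crawl pycon
    start_urls = ['https://us.pycon.org/2013/schedule/']

    def parse(self, response):
        return []    # extraction as shown earlier

$ scrapy crawl pycon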
$ scrapy runspider your_spider.py &
$ telnet localhost 6023
Gives
>>> est()
Execution engine status
time()-engine.start_time : 21.3188259602
engine.is_idle() : False
…
>>> import os; os.system('eject')
0
>>> # Hmm.
$ scrapy runspider your_spider.py -s TELNETCONSOLE_ENABLED=0 -s WEBSERVICE_ENABLED=0
Semi-complex integration with other pieces of code.
def parse(self, response):
# ...
for speaker in speakers:
partial_item = PyConPreso(author=author)
# need more data!
# ...
request = scrapy.http.Request(other_url)
Relevant snippet:
>>> import urlparse
>>> urlparse.urljoin('http://example.com/my/site', '/newpath')
'http://example.com/newpath'
>>> urlparse.urljoin('http://example.com/my/site', 'subpath')
'http://example.com/my/subpath'
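In the spider, that is how other_url gets built from a relative link scraped off the page; a sketch (href is hypothetical):
other_url = urlparse.urljoin(response.url, href)  # href: relative link from the page
request = scrapy.http.Request(other_url)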
def parse(self, response):
    # ...
    for speaker in speakers:
        partial_item = PyConPreso(author=author)
        # need more data!
        # ...
        request = scrapy.http.Request(other_url,
            callback=self.extract_next_part)
        request.meta['partial_item'] = partial_item
        yield request

def extract_next_part(self, response):
    partial_item = response.meta['partial_item']
    # do some work...
    partial_item['preso'] = _
    yield partial_item # now not partial!
Rule: Split the function if you need a new HTTP request.
- 26 hours
- +1-10 MB * N workers
- N=200 simultaneous requests
- 1 hour 10 min
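That worker count is just a setting; a sketch of asking for it at the command line (CONCURRENT_REQUESTS is a standard Scrapy setting; 200 matches the figure above):
$ scrapy runspider your_spider.py -s CONCURRENT_REQUESTS=200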
>>> p['author']
'Asheesh Laroia, Jessica McKellar, Dana Bauer, Daniel Choi'
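If you want one author per record, you have to split that string yourself; a sketch (assuming the comma-separated format shown above):
>>> [name.strip() for name in p['author'].split(',')]
['Asheesh Laroia', 'Jessica McKellar', 'Dana Bauer', 'Daniel Choi']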
Traceback (most recent call last):
...
File "/usr/lib/python2.7/urllib2.py", line 1181, in do_open
raise URLError(err)
URLError: <urlopen error [Errno -2] Name or service not known>
Ran 1 test in 0.153s
FAILED (errors=1)
Traceback (most recent call last):
...
File "/usr/lib/python2.7/urllib2.py", line 1181, in do_open
raise URLError(err)
urllib2.HTTPError: HTTP Error 403: Exceeded query limit for API key
Ran 1 test in 0.153s
FAILED (errors=1)
Traceback (most recent call last):
...
File "/usr/lib/python2.7/urllib2.py", line 1181, in do_open
raise URLError(err)
URLError: <urlopen error [Errno 110] Connection timed out>
Ran 1 test in 127.255s
FAILED (errors=1)
mock.patch()?
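For plain urllib2 code, patching the network call is easy enough; a sketch with the mock library (saved-data.html is the canned copy used in the tests below). A Scrapy spider has no urlopen call to patch, though, which is why the next slides hand the spider a canned response instead.
import mock
import urllib2

with mock.patch('urllib2.urlopen') as fake_urlopen:
    fake_urlopen.return_value.read.return_value = open('saved-data.html').read()
    page = urllib2.urlopen('http://oscon.com/').read()  # served from disk, no network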
class PyConSiteSpider(BaseSpider):
def parse(self, response):
# ...
for speaker in speakers:
# ...
yield PyConPreso(
author=author, preso=preso)
test:
def test_spider(self):
expected = [PyConPreso(author=a, preso=b), ...]
spidey = PyConSiteSpider()
canned_response = HtmlResponse(url='', body=open('saved-data.html').read())
results = list(spidey.parse(canned_response))
    self.assertEqual(expected, results)
def test_spider(self):
    url2filename = {'https://us.pycon.org/2013/schedule/':
                    'localcopy.html'}
    expected_data = [PyConPreso(author=a, preso=b), ...]
    spidey = PyConSiteSpider()
    request_iterable = spidey.start_requests()
    ar = autoresponse.Autoresponder(
        url2filename=url2filename,
        url2errors={})
    items = ar.respond_recursively(request_iterable)
    self.assertEqual(expected_data, items)
>>> import spidermonkey
>>> r = spidermonkey.Runtime()
>>> ctx = r.new_context()
>>> ctx.execute("{} + []")
0
>>> js_src = '''function (x) { return 3 + x; }'''
>>> js_fn = ctx.execute(js_src)
>>> type(js_fn)
<type 'spidermonkey.Function'>
>>> js_fn(3)
6
Get your source, e.g.
def parse(self, response):
    doc = lxml.html.fromstring(response.body_as_unicode())
    # to get a tag...
    script_content = doc.xpath('//script')[0].text_content()
Also works for non-anonymous functions:
>>> js_src = '''function add_three(x) { return 3 + x; }'''
>>> r = spidermonkey.Runtime()
>>> ctx = r.new_context()
>>> js_fn = ctx.execute(js_src + '; add_three')  # evaluate the declaration, then the name
>>> type(js_fn)
<type 'spidermonkey.Function'>
>>> js_fn(3)
6
import selenium
class MySpider(BaseSpider):
def __init__(self):
self.browser = selenium.selenium(...) # configure
self.browser.start() # synchronously launch
def parse(self, response):
self.browser.open(response.url) # GET by browser
self.browser.select('//ul') # in-browser XPath
Also look for: phantompy, ghost.py, zombie, headless webkit
class WikiImageSpider(BaseSpider):
    start_urls = ['http://en.wikipedia.org/w/api.php?action=query&titles=San_Francisco&prop=images&imlimit=20&format=json']

    def parse(self, response):
        results = json.loads(response.body_as_unicode())
        for image in results['query']['pages']['images']:
            item = WikipediaImage(_) # ...
            yield item
        if results['query-continue']['images']:
            new_url = response.url + _ # ...
            yield scrapy.http.Request(new_url, callback=self.parse)
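One way the continuation placeholder could be filled in, assuming the API's old-style query-continue block carries an imcontinue value (treat the exact key as an assumption):
# inside parse(), a sketch:
cont = results['query-continue']['images']['imcontinue']
new_url = response.url + '&imcontinue=' + cont
yield scrapy.http.Request(new_url, callback=self.parse)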
Asheesh Laroia scrapy-talk.asheesh.org