Scrape the web
speedily, reliably, and simply with scrapy
Asheesh Laroia
(thanks)
 
 
 
 
>>> # get a web page
>>> page = urllib2.urlopen('http://oscon.com/').read()
>>> # parse it
>>> soup = BeautifulSoup.BeautifulSoup(page)
>>> # find element we want
>>> matches = soup('div', {'id': 'location_place'})
Finish extraction and save:
>>> # pull out text
>>> first = matches[0]
>>> date_range = first.find(text=True)
>>> date_range
u'July 22-26, 2013'
>>> # store results somehow
>>> save_results({'conference': 'oscon', 'date_range': date_range})
>>> # get a web page
>>> page = urllib2.urlopen('http://oscon.com/').read()
This blocks until the remote site responds.
Must test online.
If this fails, the app crashes.
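A sketch of that failure mode (not in the original snippet): urllib2 raises URLError on any network trouble, so without a try/except the whole script dies.
>>> try:
...     page = urllib2.urlopen('http://oscon.com/').read()
... except urllib2.URLError:
...     page = None  # log, retry, or skip -- but don't crash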
>>> # pull out text
>>> first = matches[0]
If this fails, the app crashes.
>>> # find element we want
>>> matches = soup('div', {'id': 'location_place'})
That's just a CSS selector!
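In CSS terms that lookup is just div#location_place; a sketch of the same thing with lxml (lxml and cssselect are assumptions here, not part of the original snippet):
>>> import lxml.html
>>> doc = lxml.html.fromstring(page)
>>> matches = doc.cssselect('div#location_place')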
>>> # store results somehow
>>> save_results({'conference': 'oscon', 'date_range': date_range})
No clarity about data format. Code evolves!
Task: Get a list of speakers
SCHED_PAGE='https://us.pycon.org/2013/schedule/'
CSS and XPath
>>> import cssselect
>>> cssselect.HTMLTranslator().css_to_xpath('span.speaker')
u"descendant-or-self::span[@class and contains(concat(' ', normalize-space(@class), ' '), ' speaker ')]"
Task: Get a list of speakers
SCHED_PAGE='https://us.pycon.org/2013/schedule/'
import requests
import lxml.html
def main():
    data = requests.get(SCHED_PAGE)
    parsed = lxml.html.fromstring(data.content)
    for speaker in parsed.cssselect('span.speaker'):
        print speaker.text_content()
Why: Separate handling from retrieving
SCHED_PAGE='https://us.pycon.org/2013/schedule/'
import requests
import lxml.html
def main():
    data = requests.get(SCHED_PAGE)
    parsed = lxml.html.fromstring(data.content)
    for speaker in parsed.cssselect('span.speaker'):
        print speaker.text_content()
    #   ↑
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 0: ordinal not in range(128)
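That UnicodeEncodeError is Python 2's print trying to encode a non-ASCII speaker name with the ascii codec for the terminal; encoding explicitly is one way out (a sketch):
>>> print u'\xe9ric'.encode('utf-8')
éric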
How: Separate handling from retrieving
SCHED_PAGE='https://us.pycon.org/2013/schedule/'
import requests
import lxml.html
def get_data():
    data = requests.get(SCHED_PAGE)
    parsed = lxml.html.fromstring(data.content)
    data = []
    for speaker in parsed.cssselect('span.speaker'):
         data.append(speaker.text_content())
    return data
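The payoff, as a sketch (handle_name and main are illustrative names, not from the original): retrieval and handling can now change and be tested independently.
def handle_name(name):
    print name.encode('utf-8')

def main():
    for name in get_data():
        handle_name(name)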
Why: Clarify the fields you are retrieving
SCHED_PAGE='https://us.pycon.org/2013/schedule/'
import requests
import lxml.html
def get_data():
    data = requests.get(SCHED_PAGE)
    parsed = lxml.html.fromstring(data.content)
    data = []
    for speaker in parsed.cssselect('span.speaker'):
         datum = {}
         datum['speaker_name'] = speaker.text_content()
         datum['preso_title'] = _ # FIXME
         data.append(datum)
    return data # ↑
def handle_datum(datum):
    print datum['title'], 'by', datum['speaker_name']
#                ↑
import scrapy.item
from scrapy.item import Field

class PyConPreso(scrapy.item.Item):
    author = Field()
    preso = Field()
# Similar to...
{'author': _,
 'preso':  _}
>>> p = PyConPreso()
>>> p['title'] = 'Asheesh'
KeyError: 'PyConPreso does not support field: title'
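With its fields declared, a PyConPreso otherwise behaves like a dict; a quick sketch of the happy path:
>>> p = PyConPreso(author='Asheesh Laroia', preso='Scrape the web')
>>> p['author']
'Asheesh Laroia'
>>> dict(p)
{'author': 'Asheesh Laroia', 'preso': 'Scrape the web'}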
def get_data():
    data = requests.get(SCHED_PAGE)
    parsed = lxml.html.fromstring(data.content)
    out_data = []
    for speaker in parsed.cssselect('span.speaker'):
        author = _ # ...
        preso_title = _ # ...
        item = PyConPreso(
            author=author,
            preso=preso_title)
        out_data.append(item)
    return out_data
import lxml.html
from scrapy.spider import BaseSpider
START_URL = '...'
class PyConSiteSpider(BaseSpider):
    name = 'Demo'
    start_urls = [START_URL]
    def parse(self, response):
        parsed = lxml.html.fromstring(
                          response.body_as_unicode())
        speakers = parsed.cssselect('span.speaker')
        results = []
        for speaker in speakers:
            author = _ # placeholder
            preso = _  # placeholder
            results.append(PyConPreso(
                    author=author, preso=preso))
        return results
import lxml.html
START_URL = '...'
class PyConSiteSpider(BaseSpider):
    name = 'Demo'
    start_urls = [START_URL]
    def parse(self, response):
        parsed = lxml.html.fromstring(
                          response.body_as_unicode())
        speakers = parsed.cssselect('span.speaker')
        for speaker in speakers:
            author = _ # placeholder
            preso = _  # placeholder
            yield PyConPreso(
                    author=author, preso=preso)
$ scrapy runspider your_spider.py
2013-03-12 18:04:07-0700 [Demo] DEBUG: Crawled (200) <GET ...> (referer: None)
2013-03-12 18:04:07-0700 [Demo] DEBUG: Scraped from <200 ...>
{}
2013-03-12 18:04:07-0700 [Demo] INFO: Closing spider (finished)
2013-03-12 18:04:07-0700 [Demo] INFO: Dumping spider stats:
{'downloader/request_bytes': 513,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 75142,
'downloader/response_count': 2,
'downloader/response_status_count/200': 1,
'downloader/response_status_count/301': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2013, 3, 13, 1, 4, 7, 567078),
'item_scraped_count': 1,
'scheduler/memory_enqueued': 2,
'start_time': datetime.datetime(2013, 3, 13, 1, 4, 5, 144944)}
2013-03-12 18:04:07-0700 [Demo] INFO: Spider closed (finished)
2013-03-12 18:04:07-0700 [scrapy] INFO: Dumping global stats:
{'memusage/max': 95105024, 'memusage/startup': 95105024}
$ scrapy runspider your_spider.py -L ERROR
$
$ scrapy runspider your_spider.py -s FEED_URI=myfile.out
$
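FEED_FORMAT picks the serializer the same way (a sketch; json, jsonlines, csv and xml are the formats scrapy ships with):
$ scrapy runspider your_spider.py -s FEED_URI=items.json -s FEED_FORMAT=json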
 
 
>>> 'Pablo Hoffman' > 'Asheesh Laroia'
True
$ scrapy startproject tutorial
creates
tutorial/
    scrapy.cfg
    tutorial/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
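items.py is where a project-wide Item such as the earlier PyConPreso would live; a sketch:
# tutorial/tutorial/items.py
from scrapy.item import Item, Field

class PyConPreso(Item):
    author = Field()
    preso = Field()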
 
$ scrapy runspider your_spider.py &
$ telnet localhost 6023
Gives
>>> est()
Execution engine status
time()-engine.start_time              : 21.3188259602
engine.is_idle()                      : False
…
>>> import os; os.system('eject')
0
>>> # Hmm.
$ scrapy runspider your_spider.py -s TELNETCONSOLE_ENABLED=0 -s WEBSERVICE_ENABLED=0
Semi-complex integration with other pieces of code.
 
def parse(self, response):
    # ...
    for speaker in speakers:
        partial_item = PyConPreso(author=author)
        # need more data!
        # ...
        request = scrapy.http.Request(other_url)
Relevant snippet:
>>> import urlparse
>>> urlparse.urljoin('http://example.com/my/site', '/newpath')
'http://example.com/newpath'
>>> urlparse.urljoin('http://example.com/my/site', 'subpath')
'http://example.com/my/subpath'
def parse(self, response):
    # ...
    for speaker in speakers:
        partial_item = PyConPreso(author=author)
        # need more data!
        # ...
        request = scrapy.http.Request(other_url,
                            callback=self.extract_next_part)
        request.meta['partial_item'] = partial_item
        yield request
def extract_next_part(self, response):
    partial_item = response.meta['partial_item']
    # do some work...
    partial_item['preso'] = _
    yield partial_item # now not partial!
Rule: Split the function if you need a new HTTP request.
- Serially: 26 hours
- +1-10 MB * N workers
- With N=200 simultaneous requests: 1 hour 10 min
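The concurrency level is just a setting; a sketch (CONCURRENT_REQUESTS is scrapy's global cap, 16 by default):
$ scrapy runspider your_spider.py -s CONCURRENT_REQUESTS=200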
>>> p['author']
'Asheesh Laroia, Jessica McKellar, Dana Bauer, Daniel Choi'
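When one field packs several names like that, splitting it afterwards is easy (a sketch):
>>> [name.strip() for name in p['author'].split(',')]
['Asheesh Laroia', 'Jessica McKellar', 'Dana Bauer', 'Daniel Choi']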
Traceback (most recent call last):
 ...
 File "/usr/lib/python2.7/urllib2.py", line 1181, in do_open
    raise URLError(err)
URLError: <urlopen error [Errno -2] Name or service not known>
Ran 1 test in 0.153s
FAILED (errors=1)
Traceback (most recent call last):
 ...
 File "/usr/lib/python2.7/urllib2.py", line 1181, in do_open
    raise URLError(err)
urllib2.HTTPError: HTTP Error 403: Exceeded query limit for API key
Ran 1 test in 0.153s
FAILED (errors=1)
Traceback (most recent call last):
 ...
 File "/usr/lib/python2.7/urllib2.py", line 1181, in do_open
    raise URLError(err)
URLError: <urlopen error [Errno 110] Connection timed out>
Ran 1 test in 127.255s
FAILED (errors=1)
mock.patch()?
 
class PyConSiteSpider(BaseSpider):
    def parse(self, response):
        # ...
        for speaker in speakers:
            # ...
            yield PyConPreso(
                    author=author, preso=preso)
 
test:
>>> spidey = PyConSiteSpider()
>>> results = spidey.parse(response)
test, with a canned response:
>>> spidey = PyConSiteSpider()
>>> canned_response = HtmlResponse(url='', body=open('saved-data.html').read())
>>> results = spidey.parse(canned_response)
>>> assert list(results) == [PyConPreso(author=a, preso=b), ...]
test, as a full test method:
def test_spider(self):
    expected = [PyConPreso(author=a, preso=b), ...]
    spidey = PyConSiteSpider()
    canned_response = HtmlResponse(url='', body=open('saved-data.html').read())
    results = list(spidey.parse(canned_response))
    self.assertEqual(expected, results)
 
def test_spider(self):
    url2filename = {'https://us.pycon.org/2013/schedule/':
                           'localcopy.html'}
    expected_data = [PyConPreso(author=a, preso=b), ...]
    spidey = PyConSiteSpider()
    request_iterable = spidey.start_requests()
    ar = autoresponse.Autoresponder(
             url2filename=url2filename,
             url2errors={})
    items = ar.respond_recursively(request_iterable)
    self.assertEqual(expected_data, items)
>>> import spidermonkey
>>> r = spidermonkey.Runtime()
>>> ctx = r.new_context()
>>> ctx.execute("{} + []")
0
>>> js_src = '''function (x) { return 3 + x; }'''
>>> r = spidermonkey.Runtime()
>>> ctx = r.new_context()
>>> js_fn = ctx.execute(js_src)
>>> type(js_fn)
<type 'spidermonkey.Function'>
>>> js_fn(3)
6
Get your source, e.g.
def parse(self, response):
   # parse the page, then grab the first <script> tag's contents
   doc = lxml.html.fromstring(response.body_as_unicode())
   script_content = doc.xpath('//script')[0].text_content()
Also works for non-anonymous functions:
>>> js_src = '''function add_three(x) { return 3 + x; }'''
>>> r = spidermonkey.Runtime()
>>> ctx = r.new_context()
>>> ctx.execute(js_src)
>>> js_fn = ctx.execute("add_three")
>>> type(js_fn)
<type 'spidermonkey.Function'>
>>> js_fn(3)
6
 
 
import selenium
class MySpider(BaseSpider):
    def __init__(self):
        self.browser = selenium.selenium(...) # configure
        self.browser.start() # synchronously launch
    def parse(self, response):
        self.browser.open(response.url) # GET by browser
        self.browser.select('//ul') # in-browser XPath
Also look for: phantompy, ghost.py, zombie, headless webkit
import json
class WikiImageSpider(BaseSpider):
    start_urls = ['http://en.wikipedia.org/w/api.php?action=query&titles=San_Francisco&prop=images&imlimit=20&format=json']
    def parse(self, response):
        results = json.loads(response.body_as_unicode())
        for image in results['query']['pages']['images']:
             item = WikipediaImage(_) # ...
             yield item
        if results.get('query-continue', {}).get('images'):
            new_url = response.url + _ # ...
            yield scrapy.http.Request(new_url, callback=self.parse)
 
 
Asheesh Laroia scrapy-talk.asheesh.org