Home

Tags

Асинхронный загрузчик сайтов на tornado и curl

2012-01-31 python tornado internet async

Пример асинхронного загрузчика сайтов на tornado и curl.
Пример рассчитан на Python 2.7.

# coding: utf8

import tornado
from tornado import httpclient
from tornado import gen
from functools import partial
import Queue

# Upper bound on the number of requests that may be in flight at once.
process_max = 10
# Number of requests currently in flight; updated by worker() and process().
process_count = 0

# Shared IOLoop instance and the queue of URLs still waiting to be fetched.
gloop = tornado.ioloop.IOLoop.instance()
qinput = Queue.Queue()

# Seed the queue with 100 copies of the same sample URL.
for i in xrange(100):
    qinput.put('http://www.wikipedia.org/')

def data_process(data):
    """Hook for handling a downloaded response body.

    Intentionally a no-op placeholder: ``data`` is ignored and ``None`` is
    returned. Replace the body with real processing logic.
    """
    return None

@gen.engine
def process(url):
    """Fetch ``url`` asynchronously and pass the response body to data_process.

    Any error (a failed response or an exception raised while processing)
    is printed rather than propagated. Afterwards the in-flight counter is
    decremented and ``worker`` is rescheduled on the IOLoop so that the
    freed slot can be reused.
    """
    global process_count
    try:
        client = httpclient.AsyncHTTPClient()
        # Suspend here until the fetch callback delivers the response.
        reply = yield gen.Task(client.fetch, str(url))
        if reply.error:
            raise Exception(reply.error)
        data_process(reply.body)
    except Exception as err:
        print(err)
    # This request is finished either way: release its slot and let the
    # worker pull the next URL from the queue.
    process_count -= 1
    gloop.add_callback(worker)

def worker():
    """Start queued downloads until the concurrency limit is reached.

    Prints the current in-flight count, the limit and the queue size, then
    schedules ``process`` for queued URLs while fewer than ``process_max``
    requests are in flight. Stops the IOLoop once the queue is empty and
    no request is still running.
    """
    global process_count
    print('# %d / %d (%d)' % (process_count, process_max, qinput.qsize()))
    while process_count < process_max and not qinput.empty():
        next_url = qinput.get_nowait()
        process_count += 1
        gloop.add_callback(partial(process, next_url))
    # Nothing left to fetch and nothing in flight: all work is done.
    if qinput.empty() and not process_count:
        gloop.stop()

print('start')
# Select the curl-based client implementation; this happens before the
# loop runs, so no AsyncHTTPClient instance has been created yet.
httpclient.AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
# Queue the first worker run, then block in the loop until worker() stops it.
gloop.add_callback(worker)
gloop.start()
print('finish')


Параметры, которые можно указать при запросе к http_client.fetch:
method="GET", headers=None, body=None,
auth_username=None, auth_password=None,
connect_timeout=20.0, request_timeout=20.0,
if_modified_since=None, follow_redirects=True,
max_redirects=5, user_agent=None, use_gzip=True,
network_interface=None, streaming_callback=None,
header_callback=None, prepare_curl_callback=None,
proxy_host=None, proxy_port=None, proxy_username=None,
proxy_password='', allow_nonstandard_methods=False,
validate_cert=True, ca_certs=None,
allow_ipv6=None,
client_key=None, client_cert=None