elliotgao2 / gain
Web crawling framework based on asyncio.
License: GNU General Public License v3.0
Support IP proxy.
class BaseParser(object):
    def __init__(self, rule, item=None, attr='href'):
        self.rule = rule
        self.item = item
        self.parsing_urls = []
        self.pre_parse_urls = Queue()
        self.filter_urls = set()
        self.done_urls = []
        self.attr = attr  # added: which attribute to extract from matched elements
    # ... (rest of BaseParser omitted)

class cssParser(BaseParser):
    def abstract_urls(self, html):
        urls = [pq(x).attr(self.attr) for x in pq(html)(self.rule)]
        return urls
from gain import Css, Item, Parser, Spider, cssParser
from pyquery import PyQuery as pq

class Post(Item):
    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')
    videoType = Css('.intro > li:nth-child(1) > p', process_func=lambda pqObj: ' '.join([x.text for x in pq(pqObj[0])('a')]))
    videoAuthor = Css('.intro > li:nth-child(1) > p', process_func=lambda pqObj: ' '.join([x.text for x in pq(pqObj[1])('a')]))
    videoNotes = Css('.intro > li:nth-child(1) > p', process_func=lambda pqObj: pq(pqObj[2]).text())
    videoLang = Css('.intro > li:nth-child(1) > p', process_func=lambda pqObj: pq(pqObj[3]).text())
    videoRegion = Css('.intro > li:nth-child(1) > p', process_func=lambda pqObj: pq(pqObj[4]).text())

    async def save(self):
        if hasattr(self, 'videoTitle') and hasattr(self, 'videoType') \
                and hasattr(self, 'videoAuthor') and hasattr(self, 'videoNotes') \
                and hasattr(self, 'videoLang') and hasattr(self, 'videoRegion'):
            print('Title: %s' % self.videoTitle)
            print('Genre: %s' % self.videoType)
            print('Starring: %s' % self.videoAuthor)
            print('%s' % self.videoNotes)
            print('%s' % self.videoLang)
            print('%s' % self.videoRegion)
            print('-------')

class MySpider(Spider):
    concurrency = 5
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    parsers = [cssParser('.ui-pages a[href^="/L/lilunpian"]', attr='href'),
               cssParser('.primary-list li h5 a[href^="/V/"]', attr='href'),
               cssParser('.play-list a[href^="/player/"]', Post, attr='href'),
               ]

MySpider.run()
With concurrency set to 50, the crawl produces an enormous number of duplicated links. Don't believe it? Try it yourself.
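One way to cut the duplicates down is to deduplicate inside abstract_urls itself; a sketch reworking the cssParser above (a hypothetical change, not in gain):

class cssParser(BaseParser):
    def abstract_urls(self, html):
        # Drop duplicate hrefs while preserving their order of appearance.
        seen = set()
        urls = []
        for x in pq(html)(self.rule):
            url = pq(x).attr(self.attr)
            if url and url not in seen:
                seen.add(url)
                urls.append(url)
        return urls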
from gain import Css, Item, Parser, Spider, cssParser, Xpath
from pyquery import PyQuery as pq
import re
import requests
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String

engine = create_engine('sqlite:////home/dde/test.db', echo=False)
Base = declarative_base()

class videoInfo(Base):
    __tablename__ = 'users'
    id = Column(Integer, primary_key=True)
    videoTitle = Column(String)
    videoType = Column(String)
    videoAuthor = Column(String)
    videoNotes = Column(String)
    videoLang = Column(String)
    videoRegion = Column(String)
    videoPlayPage = Column(String)
    videoPlayLink = Column(String)

Session = sessionmaker(bind=engine)
session = Session()
Base.metadata.create_all(engine)

class getVideoInfo(Item):
    def filterPlayLink(link):
        url = 'http://www.xinxin46.com%s' % link[0]
        content = requests.get(url).text
        # the play list is embedded in the page as a JS literal; extract and eval it
        playUrl = eval(re.findall(r'\[\[.*?\]\]\]', content)[0])[0][1]
        result = str()
        for x in playUrl:
            line, playUrl, player = x.split('$')
            result += 'player----{}----{}----{}\n'.format(player, line, playUrl)
        # result = re.findall(r'/player/.*?/', content)[0][1:-1] + '$$$$' + result
        return result

    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')
    videoType = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: ' '.join([pq(x).text() for x in pq(pqObj[0])('a')]) if len(pq(pqObj[0])('a')) > 0 else pq(pqObj).text())
    videoAuthor = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: ' '.join([pq(x).text() for x in pq(pqObj[1])('a')]) if len(pq(pqObj[1])('a')) > 0 else pq(pqObj).text())
    videoNotes = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: pq(pqObj[2]).text())
    videoLang = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: pq(pqObj[3]).text())
    videoRegion = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: pq(pqObj[4]).text())
    videoPlayPage = Css('.play-list li a[href^="/player/"]', process_func=lambda pqObj: '\n'.join(['link----' + pq(x).text() + '----' + pq(x).attr('href') for x in pqObj]))
    videoPlayLink = Xpath('/html/body/div[3]/div/div[1]/div[1]/script[1]/@src', process_func=filterPlayLink)

    async def save(self):
        if hasattr(self, 'videoTitle') \
                and hasattr(self, 'videoType') \
                and hasattr(self, 'videoAuthor') \
                and hasattr(self, 'videoNotes') \
                and hasattr(self, 'videoLang') \
                and hasattr(self, 'videoRegion') \
                and hasattr(self, 'videoPlayPage') \
                and hasattr(self, 'videoPlayLink'):
            """
            if self.videoPlayLink.find('qvod') > -1:
                return
            print('Title: %s' % self.videoTitle)
            print('Genre: %s' % self.videoType)
            print('Starring: %s' % self.videoAuthor)
            print('%s' % self.videoNotes)
            print('%s' % self.videoLang)
            print('%s' % self.videoRegion)
            print('%s' % self.videoPlayPage)
            print('%s' % self.videoPlayLink)
            print('-------')
            """
            global session
            addInfo = videoInfo(videoTitle=self.videoTitle, videoType=self.videoType,
                                videoAuthor=self.videoAuthor, videoNotes=self.videoNotes,
                                videoLang=self.videoLang, videoRegion=self.videoRegion,
                                videoPlayPage=self.videoPlayPage, videoPlayLink=self.videoPlayLink)
            session.add(addInfo)
            session.commit()

class MySpider(Spider):
    concurrency = 50
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    parsers = [cssParser('.ui-pages a[href^="/L/lilunpian"]', attr='href'),
               cssParser('.primary-list li h5 a[href^="/V/"]', attr='href'),
               cssParser('.play-list a[href^="/player/"]', getVideoInfo, attr='href'),
               ]

MySpider.run()
session.close()
'''
import requests
a = requests.get('http://www.xinxin46.com/player/baishilingyincangjurudepusuOLshimingantizhidenvhaiFSET680/index-0-0.html').text
print(pq(a)('script[src^="/playdata/"]'))
'''
error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual C++ Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools
We would like to hook in extra behaviour before or after a download, such as saving the full URL, or to handle different status codes differently, e.g. following 3xx redirects and retrying on 5xx errors.
title = Css('.ph'): I don't know how to use this class, please help.
When I ran the Usage code in README.md, a TypeError occurred, which refers to this line:
await f.write(self.results)
Then I changed this line to await f.write(self.results['title'])
and everything works just fine.
I noticed that in a previous version of this README file, before aiofiles
was introduced, this part of the code used the dict self.results
the same way. So I'm not sure which is the right way to print the result.
The readme says:
pip install uvloop (Only linux)
Does it mean that gain can't be used on a Mac?
I copied the examples/sciencenet_spider.py example and tried to run it using Python 3.6, but:
python sciencenet_spider.py
[2018:04:14 22:21:26] Spider started!
[2018:04:14 22:21:26] Using selector: KqueueSelector
[2018:04:14 22:21:26] Base url: http://blog.sciencenet.cn/
[2018:04:14 22:21:26] Item "Post": 0
[2018:04:14 22:21:26] Requests count: 0
[2018:04:14 22:21:26] Error count: 0
[2018:04:14 22:21:26] Time usage: 0:00:00.001127
[2018:04:14 22:21:26] Spider finished!
Traceback (most recent call last):
File "sciencenet_spider.py", line 19, in <module>
MySpider.run()
File "/Users/endafarrell/anaconda/anaconda3/lib/python3.6/site-packages/gain/spider.py", line 52, in run
loop.run_until_complete(cls.init_parse(semaphore))
File "/Users/endafarrell/anaconda/anaconda3/lib/python3.6/asyncio/base_events.py", line 467, in run_until_complete
return future.result()
File "/Users/endafarrell/anaconda/anaconda3/lib/python3.6/site-packages/gain/spider.py", line 71, in init_parse
with aiohttp.ClientSession() as session:
File "/Users/endafarrell/anaconda/anaconda3/lib/python3.6/site-packages/aiohttp/client.py", line 746, in __enter__
raise TypeError("Use async with instead")
TypeError: Use async with instead
sys:1: RuntimeWarning: coroutine 'Parser.task' was never awaited
[2018:04:14 22:21:26] Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x105b07cf8>
My Python is:
Python 3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 12:04:33)
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] on darwin
and I have:
pip list | grep gain
gain 0.1.4
I installed gain using:
pip install gain
Any ideas?
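The traceback points at the cause: newer aiohttp releases raise TypeError for the synchronous with aiohttp.ClientSession() used in gain/spider.py. A sketch of the fix (surrounding code elided):

import aiohttp

async def init_parse(cls, semaphore):
    # aiohttp 3.x only allows the asynchronous context-manager form:
    async with aiohttp.ClientSession() as session:
        ...  # create and await the parser tasks as before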
Add a cookies field to the Spider class, because some websites require login.
In the parser.py file there is await item.save(), a function mostly used to store information in a local file (the user can override it). As far as I'm concerned, code like
async def save(self):
    with open('scrapinghub.txt', 'a+') as f:
        f.writelines(str(self.results) + '\n')
is blocking, because local filesystem access is blocking, and therefore it blocks the event loop (thread). Especially when we fetch an MB-sized file and want to store it locally, it slows down the whole application.
So, would it be possible to use aiofiles (file support for asyncio, https://github.com/Tinche/aiofiles) or loop.run_in_executor, so that the save function runs in another thread when the file is large?
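A sketch of both options against the Post item from the README (illustrative only; neither aiofiles nor the executor variant is part of gain):

import asyncio
import aiofiles
from gain import Css, Item

class Post(Item):
    title = Css('.entry-title')

    # Option 1: aiofiles writes without blocking the event loop.
    async def save(self):
        async with aiofiles.open('scrapinghub.txt', mode='a+') as f:
            await f.write(self.results['title'] + '\n')

    # Option 2: push the blocking write onto the default thread pool.
    async def save_in_executor(self):
        loop = asyncio.get_event_loop()

        def blocking_write():
            with open('scrapinghub.txt', 'a+') as f:
                f.write(self.results['title'] + '\n')

        await loop.run_in_executor(None, blocking_write)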
For example, the current blocking version:

from gain import Css, Item, Parser, Spider

class Post(Item):
    title = Css('.entry-title')
    content = Css('.entry-content')

    async def save(self):
        with open('scrapinghub.txt', 'a+') as f:
            f.writelines(self.results['title'] + '\n')

Another suggestion: add a function to handle each value before it is stored:

class Post(Item):
    title = Css('.entry-title')
    content = Css('.entry-content')

    def clean_title(self, title):
        return title.strip()

    async def save(self):
        with open('scrapinghub.txt', 'a+') as f:
            f.writelines(self.results['title'] + '\n')
Then in https://github.com/gaojiuli/gain/blob/master/gain/item.py
class Item(metaclass=ItemType):
    def __init__(self, html):
        self.results = {}
        for name, selector in self.selectors.items():
            value = selector.parse_detail(html)
            # Add function to handle value
            get_field = getattr(self, 'clean_%s' % name, None)
            if get_field:
                value = get_field(value)
            if value is None:
                logger.error('Selector "{}" for {} was wrong, please check again'.format(selector.rule, name))
            else:
                self.results[name] = value
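Assuming the modified Item above, a quick illustrative check of the hook:

# Hypothetical usage: the clean_title hook strips the extracted text.
html = '<h1 class="entry-title">  Hello World  </h1><div class="entry-content">Body</div>'
post = Post(html)
print(post.results['title'])  # -> 'Hello World'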
Need a cool logo meaning "efficient crawler".
[2018:10:25 16:14:03] Spider started!
[2018:10:25 16:14:03] Base url: https://blog.scrapinghub.com/
[2018:10:25 16:14:04] SSL handshake failed on verifying the certificate
protocol: <uvloop.loop.SSLProtocol object at 0x10729acc0>
transport: <TCPTransport closed=False reading=False 0x7fe65248c048>
Traceback (most recent call last):
File "uvloop/sslproto.pyx", line 609, in uvloop.loop.SSLProtocol._on_handshake_complete
File "uvloop/sslproto.pyx", line 171, in uvloop.loop._SSLPipe.feed_ssldata
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/ssl.py", line 763, in do_handshake
self._sslobj.do_handshake()
ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1045)
[2018:10:25 16:14:04] SSL error errno:1 reason: CERTIFICATE_VERIFY_FAILED
protocol: <uvloop.loop.SSLProtocol object at 0x10729acc0>
transport: <TCPTransport closed=False reading=False 0x7fe65248c048>
Traceback (most recent call last):
File "uvloop/sslproto.pyx", line 504, in uvloop.loop.SSLProtocol.data_received
File "uvloop/sslproto.pyx", line 204, in uvloop.loop._SSLPipe.feed_ssldata
File "uvloop/sslproto.pyx", line 171, in uvloop.loop._SSLPipe.feed_ssldata
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/ssl.py", line 763, in do_handshake
self._sslobj.do_handshake()
ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1045)
[... the same pair of "SSL handshake failed" / "SSL error errno:1 reason: CERTIFICATE_VERIFY_FAILED" tracebacks repeats for each subsequent connection attempt ...]
[2018:10:25 16:14:06] Item "Post": 0
[2018:10:25 16:14:06] Requests count: 0
[2018:10:25 16:14:06] Error count: 0
[2018:10:25 16:14:06] Time usage: 0:00:03.345306
[2018:10:25 16:14:06] Spider finished!
Process finished with exit code 0
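The CERTIFICATE_VERIFY_FAILED errors usually mean this Python cannot locate the system root certificates; with python.org installers on macOS, running the bundled Install Certificates.command normally fixes it. Alternatively, a hedged sketch of passing a certifi-backed SSL context to aiohttp (gain does not currently expose such an option; the fetch function here is illustrative):

import ssl

import aiohttp
import certifi

# Build an SSL context from certifi's CA bundle once, reuse it per request.
ssl_context = ssl.create_default_context(cafile=certifi.where())

async def fetch(url, session):
    # aiohttp accepts an SSLContext through the ssl parameter of get().
    async with session.get(url, ssl=ssl_context) as response:
        return await response.text()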
Using Firefox 57, you can copy the XPath and CSS paths.
import re

from lxml import etree
from pyquery import PyQuery as pq

class Selector:
    def __init__(self, rule, attr=None, process_func=None):
        self.rule = rule
        self.attr = attr
        self.process_func = process_func

    def __str__(self):
        return '{}({})'.format(self.__class__.__name__, self.rule)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, self.rule)

    def parse_detail(self, html):
        raise NotImplementedError

class Css(Selector):
    def parse_detail(self, html):
        d = pq(html)
        if self.process_func:
            try:
                if self.rule != 'document':
                    d = d(self.rule)
                results = self.process_func(d)
            except IndexError:
                return None
            return results if results else None
        if self.attr is None:
            try:
                return d(self.rule)[0].text
            except IndexError:
                return None
        return d(self.rule)[0].attr(self.attr, None)

class Xpath(Selector):
    def parse_detail(self, html):
        d = etree.HTML(html)
        if self.process_func:
            try:
                if self.rule != 'document':
                    d = d.xpath(self.rule)
                results = self.process_func(d)
            except IndexError:
                return None
            return results if results else None
        try:
            if self.attr is None:
                return d.xpath(self.rule)[0].text
            return d.xpath(self.rule)[0].get(self.attr, None)
        except IndexError:
            return None

class Regex(Selector):
    def parse_detail(self, html):
        try:
            return re.findall(self.rule, html)[0]
        except IndexError:
            return None
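For reference, a quick demonstration of how these selectors behave on a small fragment (illustrative snippet, not part of gain):

html = '<div class="entry-title">Hello</div>'

print(Css('.entry-title').parse_detail(html))                   # -> 'Hello'
print(Regex(r'class="entry-title">(\w+)<').parse_detail(html))  # -> 'Hello'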
In some cases the crawler rules are complex, and the results need to be parsed by hand:
from gain import Css, Item, Parser, Spider

class Post(Item):
    title = Css('html body div#content div.layout.fn-clear div#primary.mainbox.fn-left div.ui-box.l-h div.ui-cnt ul.primary-list.min-video-list.fn-clear li h5 a', process_func=lambda pq: [x.text for x in pq])
    # title is a list

    async def save(self):
        if hasattr(self, 'title'):
            # title is a list
            for x in self.title:
                print(x)
        else:
            print('error')

class MySpider(Spider):
    concurrency = 5
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    parsers = [Parser('/L/lilunpian\d+\.html', Post)]

MySpider.run()
class MySpider(Spider):
    interval = 5  # seconds
    headers = {'User-Agent': 'Google Spider'}
    start_url = 'https://blog.scrapinghub.com/'
    parsers = [Parser('https://blog.scrapinghub.com/page/\d+/'),
               Parser('https://blog.scrapinghub.com/\d{4}/\d{2}/\d{2}/[a-z0-9\-]+/', Post)]

Then each request should wait 5 seconds after the previous one, and the concurrency setting will be ignored.
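A minimal sketch of how fetch could enforce such an interval, serializing requests with a lock; none of these names are gain's actual API:

import asyncio

_request_lock = asyncio.Lock()  # hypothetical: one request at a time

async def fetch_with_interval(url, spider, session):
    interval = getattr(spider, 'interval', None)
    if interval is None:
        async with session.get(url, headers=spider.headers) as response:
            return await response.text()
    # With interval set, requests run serially, `interval` seconds apart,
    # which is why the concurrency setting no longer matters.
    async with _request_lock:
        async with session.get(url, headers=spider.headers) as response:
            data = await response.text()
        await asyncio.sleep(interval)
        return data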
Use pictures to express the whole idea of the project.
(gain) E:\workspace\GitHub\gain>pytest
============================= test session starts =============================
platform win32 -- Python 3.6.1, pytest-3.1.2, py-1.4.34, pluggy-0.4.0
rootdir: E:\workspace\GitHub\gain, inifile:
collected 4 items / 1 errors
=================================== ERRORS ====================================
_________________ ERROR collecting tests/test_file_result.py __________________
ImportError while importing test module 'E:\workspace\GitHub\gain\tests\test_file_result.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
tests\test_file_result.py:4: in
from gain.result import FileResult
E ModuleNotFoundError: No module named 'gain.result'
!!!!!!!!!!!!!!!!!!! Interrupted: 1 errors during collection !!!!!!!!!!!!!!!!!!!
=========================== 1 error in 0.51 seconds ===========================
When a page returns 404, its URL is retried over and over.
I copied your basic demo code and ran it; it errors:
Traceback (most recent call last):
File "b.py", line 23, in <module>
MySpider.run()
File "/home/qyy/anaconda3/envs/sanic/lib/python3.6/site-packages/gain/spider.py", line 52, in run
loop.run_until_complete(cls.init_parse(semaphore))
File "uvloop/loop.pyx", line 1451, in uvloop.loop.Loop.run_until_complete
File "/home/qyy/anaconda3/envs/sanic/lib/python3.6/site-packages/gain/spider.py", line 71, in init_parse
with aiohttp.ClientSession() as session:
File "/home/qyy/anaconda3/envs/sanic/lib/python3.6/site-packages/aiohttp/client.py", line 956, in __enter__
raise TypeError("Use async with instead")
TypeError: Use async with instead
[2019:04:08 15:05:18] Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fc4d2eb8e48>
sys:1: RuntimeWarning: coroutine 'Parser.task' was never awaited
and then:
from gain import Css, Item, Parser, XPathParser, Spider
ImportError: cannot import name 'XPathParser'
Thanks.
class Post(Item):
    title = Css('.entry-title')
    pic = Css('.pic', 'src')

If a field of an item is a file, it should be downloaded automatically. Any suggestions?
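gain has no built-in downloader for file fields today; a hedged sketch of doing it in a custom save (the pic field and output path are illustrative):

import os

import aiohttp
from gain import Css, Item

class Post(Item):
    title = Css('.entry-title')
    pic = Css('.pic', 'src')

    async def save(self):
        # Download the file field ourselves.
        url = self.results.get('pic')
        if not url:
            return
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                data = await response.read()
        # Naive file name choice; a real version should sanitize this.
        with open(os.path.basename(url), 'wb') as f:
            f.write(data)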
Parsing AJAX driven page.
As this project already has nearly 800 stars on GitHub, please be responsible for each pull request you accept. The code in the result directory is not of high enough quality to be accepted.
Now:
Parser('/\d{4}/\d{2}/')      # follow
Parser('/post/\d+$', Post)   # parse
This would be better:
Follower('/\d{4}/\d{2}/')    # follow
Parser('/post/\d+$', Post)   # parse
class Post(Item):
    title = Css('.entry-title')
    content = Css('.entry-content')

If a page has multiple items matching the defined item model, Gain should be able to parse all of them. I have an idea:

class Post(Item):
    __base_html__ = Css('.entry')
    title = Css('.entry-title')
    content = Css('.entry-content')

Give the item model a new attribute named base_html (or another appropriate name) that describes where each item lives, so that we can parse multiple items from each page.
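A sketch of how a parser could honour such a base selector with pyquery, splitting the page into per-item fragments before running the field selectors (parse_items and everything in it are hypothetical):

from pyquery import PyQuery as pq

def parse_items(item_cls, html):
    # Split the page on __base_html__ and build one item per fragment.
    base = getattr(item_cls, '__base_html__', None)
    if base is None:
        return [item_cls(html)]
    return [item_cls(fragment.outer_html())
            for fragment in pq(html)(base.rule).items()]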
Another idea: let save delegate to a built-in backend. For example:

class Post(Item):
    id = Css('title')

    async def save(self):
        await super().save(self.results, type='database')

class Post(Item):
    id = Css('title')

    async def save(self):
        await super().save(self.results, type='file')

Do you have any suggestions?
Add full documentation.
1. I wrote content = Css('.download_button', 'href')
in class Post, but it does not work.
Error info:
Selector ".video-download-button" for url was wrong, please check again
which means that value is None. In fact, executing d(self.rule)[0].attr(self.attr, None)
terminates, because the object returned by [0] has no attr attribute.
2. Right now the selector only picks the first element, because of the code d(self.rule)[0].text.
How can I select all elements matching self.rule and get their attributes? (I searched the docs at http://pyquery.readthedocs.io/en/latest/ but found no answer.)
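Both questions come down to pyquery usage; a small illustrative snippet (not gain code):

from pyquery import PyQuery as pq

html = '<div><a class="download_button" href="/a">A</a><a class="download_button" href="/b">B</a></div>'
d = pq(html)

# 1. d(rule)[0] is a bare lxml element with no .attr method;
#    wrap it in PyQuery again before reading the attribute.
first_href = pq(d('.download_button')[0]).attr('href')              # '/a'

# 2. .items() yields every matching element as a PyQuery object.
all_hrefs = [a.attr('href') for a in d('.download_button').items()]  # ['/a', '/b']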
There are Css() and Xpath() already.
I think Regex() would be useful too.

class Post(Item):
    id = Regex('\d{32}')
def generate_header():
    header = {'User-agent': 'Google spider'}
    return header

class MySpider(Spider):
    start_url = 'https://blog.scrapinghub.com/'
    header = generate_header
    concurrency = 5
    parsers = [Parser('https://blog.scrapinghub.com/page/\d+/'),
               Parser('https://blog.scrapinghub.com/\d{4}/\d{2}/\d{2}/[a-z0-9\-]+/', Post)]

and

class MySpider(Spider):
    start_url = 'https://blog.scrapinghub.com/'
    header = {'User-agent': 'Google spider'}
    concurrency = 5
    parsers = [Parser('https://blog.scrapinghub.com/page/\d+/'),
               Parser('https://blog.scrapinghub.com/\d{4}/\d{2}/\d{2}/[a-z0-9\-]+/', Post)]

Both should be supported.
re.findall issue
I reviewed the tests in this project after experiencing issues with my regex also catching some HTML along the way.
So I reviewed this test file: https://github.com/gaojiuli/gain/blob/master/tests/test_parse_multiple_items.py and captured the output of abstract_urls.
Version 0.1.4 of this project returns this:
URLS we found: ['/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/']
re.findall
returns what your capture groups request, not necessarily the full text that was matched!
Test incorrect
The base URLs http://quotes.toscrape.com/ and http://quotes.toscrape.com/page/1 are the same page, and if you look into the HTML you will only find a reference to "/page/2" but not to "/page/1". For this reason the test seems to work, but it was actually flawed from the start.
re.match
I rewrote the function abstract_urls to:
import re

import lxml.html
from lxml import etree
from urllib import parse as urlparse

from .log import logger  # assumed imports for this snippet

def abstract_urls(self, html, base_url):
    _urls = []
    try:
        document = lxml.html.fromstring(html)
        document_domain = urlparse.urlparse(base_url).netloc
        for (el, attr, link, pos) in document.iterlinks():
            link = re.sub("#.*", "", link or "")
            if not link:
                continue
            _urls.append(link)
    except (etree.XMLSyntaxError, etree.ParserError) as e:
        logger.error("While parsing the html for {} we received the following error {}.".format(base_url, e))
    # Clean up urls
    r = re.compile(self.rule)
    urls = list(filter(r.match, _urls))
    return urls
and now this is the result of abstract_urls:
['/static/bootstrap.min.css', '/static/main.css', '/', '/login', '/author/Albert-Einstein', '/tag/change/page/1/', '/tag/deep-thoughts/page/1/', '/tag/thinking/page/1/', '/tag/world/page/1/', '/author/J-K-Rowling', '/tag/abilities/page/1/', '/tag/choices/page/1/', '/author/Albert-Einstein', '/tag/inspirational/page/1/', '/tag/life/page/1/', '/tag/live/page/1/', '/tag/miracle/page/1/', '/tag/miracles/page/1/', '/author/Jane-Austen', '/tag/aliteracy/page/1/', '/tag/books/page/1/', '/tag/classic/page/1/', '/tag/humor/page/1/', '/author/Marilyn-Monroe', '/tag/be-yourself/page/1/', '/tag/inspirational/page/1/', '/author/Albert-Einstein', '/tag/adulthood/page/1/', '/tag/success/page/1/', '/tag/value/page/1/', '/author/Andre-Gide', '/tag/life/page/1/', '/tag/love/page/1/', '/author/Thomas-A-Edison', '/tag/edison/page/1/', '/tag/failure/page/1/', '/tag/inspirational/page/1/', '/tag/paraphrased/page/1/', '/author/Eleanor-Roosevelt', '/tag/misattributed-eleanor-roosevelt/page/1/', '/author/Steve-Martin', '/tag/humor/page/1/', '/tag/obvious/page/1/', '/tag/simile/page/1/', '/page/2/', '/tag/love/', '/tag/inspirational/', '/tag/life/', '/tag/humor/', '/tag/books/', '/tag/reading/', '/tag/friendship/', '/tag/friends/', '/tag/truth/', '/tag/simile/', 'https://www.goodreads.com/quotes', 'https://scrapinghub.com']
This test: tests/test_parse_multiple_items.py now fails as it should.
Handle errors when the aiohttp response goes wrong.
import asyncio

from .log import logger

try:
    import uvloop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

async def fetch(url, spider, session, semaphore):
    with (await semaphore):
        try:
            if callable(spider.headers):
                headers = spider.headers()
            else:
                headers = spider.headers
            # changed here: honour an optional per-spider encoding
            if hasattr(spider, 'encoding'):
                codec = spider.encoding
            else:
                codec = 'utf-8'
            async with session.get(url, headers=headers) as response:
                if response.status in [200, 201]:
                    data = await response.text(encoding=codec)  # changed here: decode with that encoding
                    return data
                logger.error('Error: {} {}'.format(url, response.status))
                return None
        except:
            return None
class MySpider(Spider):
    concurrency = 5
    encoding = 'gbk'
    start_url = r'http://blog.sciencenet.cn/home.php?mod=space&uid=40109&do=blog&view=me&from=space&page=1'
    parsers = [Parser('http://blog.sciencenet.cn/home.php.*?page=\d+', Post)]
Check the last PR, which was closed by the author himself:
#50
Add homepage.
Add a note to the requirements (uvloop does not support Windows clients):
Collecting gain
Downloading gain-0.1.1.tar.gz
Collecting uvloop (from gain)
Downloading uvloop-0.8.0.tar.gz (1.7MB)
100% |################################| 1.7MB 534kB/s
Complete output from command python setup.py egg_info:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "C:\Users\idi\AppData\Local\Temp\pip-build-gjya289j\uvloop\setup.py", line 11, in <module>
raise RuntimeError('uvloop does not support Windows at the moment')
RuntimeError: uvloop does not support Windows at the moment
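A sketch of declaring uvloop as a non-Windows dependency in setup.py via a standard PEP 508 environment marker (hypothetical; gain's actual setup.py may differ):

from setuptools import setup

setup(
    name='gain',
    install_requires=[
        'aiohttp',
        'pyquery',
        # Skip uvloop on Windows, where it does not build:
        'uvloop; sys_platform != "win32"',
    ],
)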