elliotgao2 / gain
Web crawling framework based on asyncio.
License: GNU General Public License v3.0
Support IP proxy.
class BaseParser(object):
    def __init__(self, rule, item=None, attr='href'):
        self.rule = rule
        self.item = item
        self.parsing_urls = []
        self.pre_parse_urls = Queue()
        self.filter_urls = set()
        self.done_urls = []
        self.attr = attr  # added: which attribute to extract from matched elements
    # ... (rest of BaseParser omitted)

class cssParser(BaseParser):
    def abstract_urls(self, html):
        urls = [pq(x).attr(self.attr) for x in pq(html)(self.rule)]
        return urls
from gain import Css, Item, Parser, Spider, cssParser
from pyquery import PyQuery as pq

class Post(Item):
    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')
    videoType = Css('.intro > li:nth-child(1) > p', process_func=lambda pqObj: ' '.join([x.text for x in pq(pqObj[0])('a')]))
    videoAuthor = Css('.intro > li:nth-child(1) > p', process_func=lambda pqObj: ' '.join([x.text for x in pq(pqObj[1])('a')]))
    videoNotes = Css('.intro > li:nth-child(1) > p', process_func=lambda pqObj: pq(pqObj[2]).text())
    videoLang = Css('.intro > li:nth-child(1) > p', process_func=lambda pqObj: pq(pqObj[3]).text())
    videoRegion = Css('.intro > li:nth-child(1) > p', process_func=lambda pqObj: pq(pqObj[4]).text())

    async def save(self):
        if hasattr(self, 'videoTitle') and hasattr(self, 'videoType') \
                and hasattr(self, 'videoAuthor') and hasattr(self, 'videoNotes') \
                and hasattr(self, 'videoLang') and hasattr(self, 'videoRegion'):
            print('Title: %s' % self.videoTitle)
            print('Genre: %s' % self.videoType)
            print('Starring: %s' % self.videoAuthor)
            print('%s' % self.videoNotes)
            print('%s' % self.videoLang)
            print('%s' % self.videoRegion)
            print('-------')

class MySpider(Spider):
    concurrency = 5
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    parsers = [cssParser('.ui-pages a[href^="/L/lilunpian"]', attr='href'),
               cssParser('.primary-list li h5 a[href^="/V/"]', attr='href'),
               cssParser('.play-list a[href^="/player/"]', Post, attr='href'),
               ]

MySpider.run()
With concurrency set to 50, the crawl produces an enormous number of duplicated links. Don't believe it? Try it yourself.
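One way to cut the duplicates down is to deduplicate inside abstract_urls itself; a sketch reworking the cssParser above (a hypothetical change, not in gain):

class cssParser(BaseParser):
    def abstract_urls(self, html):
        # Drop duplicate hrefs while preserving their order of appearance.
        seen = set()
        urls = []
        for x in pq(html)(self.rule):
            url = pq(x).attr(self.attr)
            if url and url not in seen:
                seen.add(url)
                urls.append(url)
        return urls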
from gain import Css, Item, Parser, Spider, cssParser, Xpath
from pyquery import PyQuery as pq
import re
import requests
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String

engine = create_engine('sqlite:////home/dde/test.db', echo=False)
Base = declarative_base()

class videoInfo(Base):
    __tablename__ = 'users'
    id = Column(Integer, primary_key=True)
    videoTitle = Column(String)
    videoType = Column(String)
    videoAuthor = Column(String)
    videoNotes = Column(String)
    videoLang = Column(String)
    videoRegion = Column(String)
    videoPlayPage = Column(String)
    videoPlayLink = Column(String)

Session = sessionmaker(bind=engine)
session = Session()
Base.metadata.create_all(engine)

class getVideoInfo(Item):
    def filterPlayLink(link):
        url = 'http://www.xinxin46.com%s' % link[0]
        content = requests.get(url).text
        # the play list is embedded in the page as a JS literal; extract and eval it
        playUrl = eval(re.findall(r'\[\[.*?\]\]\]', content)[0])[0][1]
        result = str()
        for x in playUrl:
            line, playUrl, player = x.split('$')
            result += 'player----{}----{}----{}\n'.format(player, line, playUrl)
        # result = re.findall(r'/player/.*?/', content)[0][1:-1] + '$$$$' + result
        return result

    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')
    videoType = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: ' '.join([pq(x).text() for x in pq(pqObj[0])('a')]) if len(pq(pqObj[0])('a')) > 0 else pq(pqObj).text())
    videoAuthor = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: ' '.join([pq(x).text() for x in pq(pqObj[1])('a')]) if len(pq(pqObj[1])('a')) > 0 else pq(pqObj).text())
    videoNotes = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: pq(pqObj[2]).text())
    videoLang = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: pq(pqObj[3]).text())
    videoRegion = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: pq(pqObj[4]).text())
    videoPlayPage = Css('.play-list li a[href^="/player/"]', process_func=lambda pqObj: '\n'.join(['link----' + pq(x).text() + '----' + pq(x).attr('href') for x in pqObj]))
    videoPlayLink = Xpath('/html/body/div[3]/div/div[1]/div[1]/script[1]/@src', process_func=filterPlayLink)

    async def save(self):
        if hasattr(self, 'videoTitle') \
                and hasattr(self, 'videoType') \
                and hasattr(self, 'videoAuthor') \
                and hasattr(self, 'videoNotes') \
                and hasattr(self, 'videoLang') \
                and hasattr(self, 'videoRegion') \
                and hasattr(self, 'videoPlayPage') \
                and hasattr(self, 'videoPlayLink'):
            """
            if self.videoPlayLink.find('qvod') > -1:
                return
            print('Title: %s' % self.videoTitle)
            print('Genre: %s' % self.videoType)
            print('Starring: %s' % self.videoAuthor)
            print('%s' % self.videoNotes)
            print('%s' % self.videoLang)
            print('%s' % self.videoRegion)
            print('%s' % self.videoPlayPage)
            print('%s' % self.videoPlayLink)
            print('-------')
            """
            global session
            addInfo = videoInfo(videoTitle=self.videoTitle, videoType=self.videoType,
                                videoAuthor=self.videoAuthor, videoNotes=self.videoNotes,
                                videoLang=self.videoLang, videoRegion=self.videoRegion,
                                videoPlayPage=self.videoPlayPage, videoPlayLink=self.videoPlayLink)
            session.add(addInfo)
            session.commit()

class MySpider(Spider):
    concurrency = 50
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    parsers = [cssParser('.ui-pages a[href^="/L/lilunpian"]', attr='href'),
               cssParser('.primary-list li h5 a[href^="/V/"]', attr='href'),
               cssParser('.play-list a[href^="/player/"]', getVideoInfo, attr='href'),
               ]

MySpider.run()
session.close()
'''
import requests
a = requests.get('http://www.xinxin46.com/player/baishilingyincangjurudepusuOLshimingantizhidenvhaiFSET680/index-0-0.html').text
print(pq(a)('script[src^="/playdata/"]'))
'''
error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual C++ Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools
We would like to hook in extra behaviour before or after a download, such as saving the full URL, or to handle different status codes differently, e.g. following 3xx redirects and retrying on 5xx errors.
title = Css('.ph'): I don't know how to use this class, please help.
When I ran the Usage code in README.md, a TypeError occurred, which refers to this line:
await f.write(self.results)
Then I changed this line to await f.write(self.results['title'])
and everything works just fine.
I noticed that in a previous version of this README file, before aiofiles
was introduced, this part of the code used the dict self.results
the same way. So I'm not sure which is the right way to print the result.
The readme says:
pip install uvloop (Only linux)
Does it mean that gain can't be used on a Mac?
I copied the examples/sciencenet_spider.py example and tried to run it using Python 3.6, but:
python sciencenet_spider.py
[2018:04:14 22:21:26] Spider started!
[2018:04:14 22:21:26] Using selector: KqueueSelector
[2018:04:14 22:21:26] Base url: http://blog.sciencenet.cn/
[2018:04:14 22:21:26] Item "Post": 0
[2018:04:14 22:21:26] Requests count: 0
[2018:04:14 22:21:26] Error count: 0
[2018:04:14 22:21:26] Time usage: 0:00:00.001127
[2018:04:14 22:21:26] Spider finished!
Traceback (most recent call last):
File "sciencenet_spider.py", line 19, in <module>
MySpider.run()
File "/Users/endafarrell/anaconda/anaconda3/lib/python3.6/site-packages/gain/spider.py", line 52, in run
loop.run_until_complete(cls.init_parse(semaphore))
File "/Users/endafarrell/anaconda/anaconda3/lib/python3.6/asyncio/base_events.py", line 467, in run_until_complete
return future.result()
File "/Users/endafarrell/anaconda/anaconda3/lib/python3.6/site-packages/gain/spider.py", line 71, in init_parse
with aiohttp.ClientSession() as session:
File "/Users/endafarrell/anaconda/anaconda3/lib/python3.6/site-packages/aiohttp/client.py", line 746, in __enter__
raise TypeError("Use async with instead")
TypeError: Use async with instead
sys:1: RuntimeWarning: coroutine 'Parser.task' was never awaited
[2018:04:14 22:21:26] Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x105b07cf8>
My Python is:
Python 3.6.4 |Anaconda, Inc.| (default, Jan 16 2018, 12:04:33)
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)] on darwin
and I have:
pip list | grep gain
gain 0.1.4
I installed gain using:
pip install gain
Any ideas?
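The traceback points at the cause: newer aiohttp releases raise TypeError for the synchronous with aiohttp.ClientSession() used in gain/spider.py. A sketch of the fix (surrounding code elided):

import aiohttp

async def init_parse(cls, semaphore):
    # aiohttp 3.x only allows the asynchronous context-manager form:
    async with aiohttp.ClientSession() as session:
        ...  # create and await the parser tasks as before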
Add a cookies field to the Spider class, because some websites require login.
In the parser.py file there is await item.save(), a function mostly used to store information in a local file (the user can override it). As far as I'm concerned, code like
async def save(self):
    with open('scrapinghub.txt', 'a+') as f:
        f.writelines(str(self.results) + '\n')
is blocking, because local filesystem access is blocking, and therefore it blocks the event loop (thread). Especially when we fetch an MB-sized file and want to store it locally, it slows down the whole application.
So, would it be possible to use aiofiles (file support for asyncio, https://github.com/Tinche/aiofiles) or loop.run_in_executor, so that the save function runs in another thread when the file is large?
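A sketch of both options against the Post item from the README (illustrative only; neither aiofiles nor the executor variant is part of gain):

import asyncio
import aiofiles
from gain import Css, Item

class Post(Item):
    title = Css('.entry-title')

    # Option 1: aiofiles writes without blocking the event loop.
    async def save(self):
        async with aiofiles.open('scrapinghub.txt', mode='a+') as f:
            await f.write(self.results['title'] + '\n')

    # Option 2: push the blocking write onto the default thread pool.
    async def save_in_executor(self):
        loop = asyncio.get_event_loop()

        def blocking_write():
            with open('scrapinghub.txt', 'a+') as f:
                f.write(self.results['title'] + '\n')

        await loop.run_in_executor(None, blocking_write)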
For example, the current blocking version:

from gain import Css, Item, Parser, Spider

class Post(Item):
    title = Css('.entry-title')
    content = Css('.entry-content')

    async def save(self):
        with open('scrapinghub.txt', 'a+') as f:
            f.writelines(self.results['title'] + '\n')

Another suggestion: add a function to handle each value before it is stored:

class Post(Item):
    title = Css('.entry-title')
    content = Css('.entry-content')

    def clean_title(self, title):
        return title.strip()

    async def save(self):
        with open('scrapinghub.txt', 'a+') as f:
            f.writelines(self.results['title'] + '\n')
Then in https://github.com/gaojiuli/gain/blob/master/gain/item.py
class Item(metaclass=ItemType):
    def __init__(self, html):
        self.results = {}
        for name, selector in self.selectors.items():
            value = selector.parse_detail(html)
            # Add function to handle value
            get_field = getattr(self, 'clean_%s' % name, None)
            if get_field:
                value = get_field(value)
            if value is None:
                logger.error('Selector "{}" for {} was wrong, please check again'.format(selector.rule, name))
            else:
                self.results[name] = value
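Assuming the modified Item above, a quick illustrative check of the hook:

# Hypothetical usage: the clean_title hook strips the extracted text.
html = '<h1 class="entry-title">  Hello World  </h1><div class="entry-content">Body</div>'
post = Post(html)
print(post.results['title'])  # -> 'Hello World'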
Need a cool logo meaning "efficient crawler".
[2018:10:25 16:14:03] Spider started!
[2018:10:25 16:14:03] Base url: https://blog.scrapinghub.com/
[2018:10:25 16:14:04] SSL handshake failed on verifying the certificate
protocol: <uvloop.loop.SSLProtocol object at 0x10729acc0>
transport: <TCPTransport closed=False reading=False 0x7fe65248c048>
Traceback (most recent call last):
File "uvloop/sslproto.pyx", line 609, in uvloop.loop.SSLProtocol._on_handshake_complete
File "uvloop/sslproto.pyx", line 171, in uvloop.loop._SSLPipe.feed_ssldata
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/ssl.py", line 763, in do_handshake
self._sslobj.do_handshake()
ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1045)
[2018:10:25 16:14:04] SSL error errno:1 reason: CERTIFICATE_VERIFY_FAILED
protocol: <uvloop.loop.SSLProtocol object at 0x10729acc0>
transport: <TCPTransport closed=False reading=False 0x7fe65248c048>
Traceback (most recent call last):
File "uvloop/sslproto.pyx", line 504, in uvloop.loop.SSLProtocol.data_received
File "uvloop/sslproto.pyx", line 204, in uvloop.loop._SSLPipe.feed_ssldata
File "uvloop/sslproto.pyx", line 171, in uvloop.loop._SSLPipe.feed_ssldata
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/ssl.py", line 763, in do_handshake
self._sslobj.do_handshake()
ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1045)
[... the same pair of "SSL handshake failed" / "SSL error errno:1 reason: CERTIFICATE_VERIFY_FAILED" tracebacks repeats for each subsequent connection attempt ...]
[2018:10:25 16:14:06] Item "Post": 0
[2018:10:25 16:14:06] Requests count: 0
[2018:10:25 16:14:06] Error count: 0
[2018:10:25 16:14:06] Time usage: 0:00:03.345306
[2018:10:25 16:14:06] Spider finished!
Process finished with exit code 0
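The CERTIFICATE_VERIFY_FAILED errors usually mean this Python cannot locate the system root certificates; with python.org installers on macOS, running the bundled Install Certificates.command normally fixes it. Alternatively, a hedged sketch of passing a certifi-backed SSL context to aiohttp (gain does not currently expose such an option; the fetch function here is illustrative):

import ssl

import aiohttp
import certifi

# Build an SSL context from certifi's CA bundle once, reuse it per request.
ssl_context = ssl.create_default_context(cafile=certifi.where())

async def fetch(url, session):
    # aiohttp accepts an SSLContext through the ssl parameter of get().
    async with session.get(url, ssl=ssl_context) as response:
        return await response.text()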
Using Firefox 57, you can copy the XPath and CSS paths.
import re

from lxml import etree
from pyquery import PyQuery as pq

class Selector:
    def __init__(self, rule, attr=None, process_func=None):
        self.rule = rule
        self.attr = attr
        self.process_func = process_func

    def __str__(self):
        return '{}({})'.format(self.__class__.__name__, self.rule)

    def __repr__(self):
        return '{}({})'.format(self.__class__.__name__, self.rule)

    def parse_detail(self, html):
        raise NotImplementedError

class Css(Selector):
    def parse_detail(self, html):
        d = pq(html)
        if self.process_func:
            try:
                if self.rule != 'document':
                    d = d(self.rule)
                results = self.process_func(d)
            except IndexError:
                return None
            return results if results else None
        if self.attr is None:
            try:
                return d(self.rule)[0].text
            except IndexError:
                return None
        return d(self.rule)[0].attr(self.attr, None)

class Xpath(Selector):
    def parse_detail(self, html):
        d = etree.HTML(html)
        if self.process_func:
            try:
                if self.rule != 'document':
                    d = d.xpath(self.rule)
                results = self.process_func(d)
            except IndexError:
                return None
            return results if results else None
        try:
            if self.attr is None:
                return d.xpath(self.rule)[0].text
            return d.xpath(self.rule)[0].get(self.attr, None)
        except IndexError:
            return None

class Regex(Selector):
    def parse_detail(self, html):
        try:
            return re.findall(self.rule, html)[0]
        except IndexError:
            return None
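For reference, a quick demonstration of how these selectors behave on a small fragment (illustrative snippet, not part of gain):

html = '<div class="entry-title">Hello</div>'

print(Css('.entry-title').parse_detail(html))                   # -> 'Hello'
print(Regex(r'class="entry-title">(\w+)<').parse_detail(html))  # -> 'Hello'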
In some cases the crawler rules are complex, and the results need to be parsed by hand:
from gain import Css, Item, Parser, Spider

class Post(Item):
    title = Css('html body div#content div.layout.fn-clear div#primary.mainbox.fn-left div.ui-box.l-h div.ui-cnt ul.primary-list.min-video-list.fn-clear li h5 a', process_func=lambda pq: [x.text for x in pq])
    # title is a list

    async def save(self):
        if hasattr(self, 'title'):
            # title is a list
            for x in self.title:
                print(x)
        else:
            print('error')

class MySpider(Spider):
    concurrency = 5
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    parsers = [Parser('/L/lilunpian\d+\.html', Post)]

MySpider.run()
class MySpider(Spider):
    interval = 5  # seconds
    headers = {'User-Agent': 'Google Spider'}
    start_url = 'https://blog.scrapinghub.com/'
    parsers = [Parser('https://blog.scrapinghub.com/page/\d+/'),
               Parser('https://blog.scrapinghub.com/\d{4}/\d{2}/\d{2}/[a-z0-9\-]+/', Post)]

Then each request should wait 5 seconds after the previous one, and the concurrency setting will be ignored.
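A minimal sketch of how fetch could enforce such an interval, serializing requests with a lock; none of these names are gain's actual API:

import asyncio

_request_lock = asyncio.Lock()  # hypothetical: one request at a time

async def fetch_with_interval(url, spider, session):
    interval = getattr(spider, 'interval', None)
    if interval is None:
        async with session.get(url, headers=spider.headers) as response:
            return await response.text()
    # With interval set, requests run serially, `interval` seconds apart,
    # which is why the concurrency setting no longer matters.
    async with _request_lock:
        async with session.get(url, headers=spider.headers) as response:
            data = await response.text()
        await asyncio.sleep(interval)
        return data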
Use pictures to express the whole idea of the project.
(gain) E:\workspace\GitHub\gain>pytest
============================= test session starts =============================
platform win32 -- Python 3.6.1, pytest-3.1.2, py-1.4.34, pluggy-0.4.0
rootdir: E:\workspace\GitHub\gain, inifile:
collected 4 items / 1 errors
=================================== ERRORS ====================================
_________________ ERROR collecting tests/test_file_result.py __________________
ImportError while importing test module 'E:\workspace\GitHub\gain\tests\test_file_result.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
tests\test_file_result.py:4: in
from gain.result import FileResult
E ModuleNotFoundError: No module named 'gain.result'
!!!!!!!!!!!!!!!!!!! Interrupted: 1 errors during collection !!!!!!!!!!!!!!!!!!!
=========================== 1 error in 0.51 seconds ===========================
When a page returns 404, its URL is retried over and over.
I copied your basic demo code and ran it; it errors:
Traceback (most recent call last):
File "b.py", line 23, in <module>
MySpider.run()
File "/home/qyy/anaconda3/envs/sanic/lib/python3.6/site-packages/gain/spider.py", line 52, in run
loop.run_until_complete(cls.init_parse(semaphore))
File "uvloop/loop.pyx", line 1451, in uvloop.loop.Loop.run_until_complete
File "/home/qyy/anaconda3/envs/sanic/lib/python3.6/site-packages/gain/spider.py", line 71, in init_parse
with aiohttp.ClientSession() as session:
File "/home/qyy/anaconda3/envs/sanic/lib/python3.6/site-packages/aiohttp/client.py", line 956, in __enter__
raise TypeError("Use async with instead")
TypeError: Use async with instead
[2019:04:08 15:05:18] Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7fc4d2eb8e48>
sys:1: RuntimeWarning: coroutine 'Parser.task' was never awaited
and then:
from gain import Css, Item, Parser, XPathParser, Spider
ImportError: cannot import name 'XPathParser'
Thanks.
class Post(Item):
    title = Css('.entry-title')
    pic = Css('.pic', 'src')

If a field of an item is a file, it should be downloaded automatically. Any suggestions?
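gain has no built-in downloader for file fields today; a hedged sketch of doing it in a custom save (the pic field and output path are illustrative):

import os

import aiohttp
from gain import Css, Item

class Post(Item):
    title = Css('.entry-title')
    pic = Css('.pic', 'src')

    async def save(self):
        # Download the file field ourselves.
        url = self.results.get('pic')
        if not url:
            return
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                data = await response.read()
        # Naive file name choice; a real version should sanitize this.
        with open(os.path.basename(url), 'wb') as f:
            f.write(data)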
Parsing AJAX driven page.
As this project already has nearly 800 stars on GitHub, please be responsible for each pull request you accept. The code in the result directory is not of high enough quality to be accepted.
Now:
Parser('/\d{4}/\d{2}/')      # follow
Parser('/post/\d+$', Post)   # parse
This would be better:
Follower('/\d{4}/\d{2}/')    # follow
Parser('/post/\d+$', Post)   # parse
class Post(Item):
    title = Css('.entry-title')
    content = Css('.entry-content')

If a page has multiple items matching the defined item model, Gain should be able to parse all of them. I have an idea:

class Post(Item):
    __base_html__ = Css('.entry')
    title = Css('.entry-title')
    content = Css('.entry-content')

Give the item model a new attribute named base_html (or another appropriate name) that describes where each item lives, so that we can parse multiple items from each page.
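A sketch of how a parser could honour such a base selector with pyquery, splitting the page into per-item fragments before running the field selectors (parse_items and everything in it are hypothetical):

from pyquery import PyQuery as pq

def parse_items(item_cls, html):
    # Split the page on __base_html__ and build one item per fragment.
    base = getattr(item_cls, '__base_html__', None)
    if base is None:
        return [item_cls(html)]
    return [item_cls(fragment.outer_html())
            for fragment in pq(html)(base.rule).items()]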
Another idea: let save delegate to a built-in backend. For example:

class Post(Item):
    id = Css('title')

    async def save(self):
        await super().save(self.results, type='database')

class Post(Item):
    id = Css('title')

    async def save(self):
        await super().save(self.results, type='file')

Do you have any suggestions?
Add full documentation.
1. I wrote content = Css('.download_button', 'href')
in class Post, but it does not work.
Error info:
Selector ".video-download-button" for url was wrong, please check again
which means that value is None. In fact, executing d(self.rule)[0].attr(self.attr, None)
terminates, because the object returned by [0] has no attr attribute.
2. Right now the selector only picks the first element, because of the code d(self.rule)[0].text.
How can I select all elements matching self.rule and get their attributes? (I searched the docs at http://pyquery.readthedocs.io/en/latest/ but found no answer.)
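Both questions come down to pyquery usage; a small illustrative snippet (not gain code):

from pyquery import PyQuery as pq

html = '<div><a class="download_button" href="/a">A</a><a class="download_button" href="/b">B</a></div>'
d = pq(html)

# 1. d(rule)[0] is a bare lxml element with no .attr method;
#    wrap it in PyQuery again before reading the attribute.
first_href = pq(d('.download_button')[0]).attr('href')              # '/a'

# 2. .items() yields every matching element as a PyQuery object.
all_hrefs = [a.attr('href') for a in d('.download_button').items()]  # ['/a', '/b']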
There are Css() and Xpath() already.
I think Regex() would be useful too.

class Post(Item):
    id = Regex('\d{32}')
def generate_header():
    header = {'User-agent': 'Google spider'}
    return header

class MySpider(Spider):
    start_url = 'https://blog.scrapinghub.com/'
    header = generate_header
    concurrency = 5
    parsers = [Parser('https://blog.scrapinghub.com/page/\d+/'),
               Parser('https://blog.scrapinghub.com/\d{4}/\d{2}/\d{2}/[a-z0-9\-]+/', Post)]

and

class MySpider(Spider):
    start_url = 'https://blog.scrapinghub.com/'
    header = {'User-agent': 'Google spider'}
    concurrency = 5
    parsers = [Parser('https://blog.scrapinghub.com/page/\d+/'),
               Parser('https://blog.scrapinghub.com/\d{4}/\d{2}/\d{2}/[a-z0-9\-]+/', Post)]

Both should be supported.
re.findall issue
I reviewed the tests in this project after experiencing issues with my regex also catching some HTML along the way.
So I reviewed this test file: https://github.com/gaojiuli/gain/blob/master/tests/test_parse_multiple_items.py and captured the output of abstract_urls.
Version 0.1.4 of this project returns this:
URLS we found: ['/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/', '/page/1/']
re.findall
returns what your capture groups request, not necessarily the full text that was matched!
Test incorrect
The base URLs http://quotes.toscrape.com/ and http://quotes.toscrape.com/page/1 are the same page, and if you look into the HTML you will only find a reference to "/page/2" but not to "/page/1". For this reason the test seems to work, but it was actually flawed from the start.
re.match
I rewrote the function abstract_urls to:
import re

import lxml.html
from lxml import etree
from urllib import parse as urlparse

from .log import logger  # assumed imports for this snippet

def abstract_urls(self, html, base_url):
    _urls = []
    try:
        document = lxml.html.fromstring(html)
        document_domain = urlparse.urlparse(base_url).netloc
        for (el, attr, link, pos) in document.iterlinks():
            link = re.sub("#.*", "", link or "")
            if not link:
                continue
            _urls.append(link)
    except (etree.XMLSyntaxError, etree.ParserError) as e:
        logger.error("While parsing the html for {} we received the following error {}.".format(base_url, e))
    # Clean up urls
    r = re.compile(self.rule)
    urls = list(filter(r.match, _urls))
    return urls
and now this is the result of abstract_urls:
['/static/bootstrap.min.css', '/static/main.css', '/', '/login', '/author/Albert-Einstein', '/tag/change/page/1/', '/tag/deep-thoughts/page/1/', '/tag/thinking/page/1/', '/tag/world/page/1/', '/author/J-K-Rowling', '/tag/abilities/page/1/', '/tag/choices/page/1/', '/author/Albert-Einstein', '/tag/inspirational/page/1/', '/tag/life/page/1/', '/tag/live/page/1/', '/tag/miracle/page/1/', '/tag/miracles/page/1/', '/author/Jane-Austen', '/tag/aliteracy/page/1/', '/tag/books/page/1/', '/tag/classic/page/1/', '/tag/humor/page/1/', '/author/Marilyn-Monroe', '/tag/be-yourself/page/1/', '/tag/inspirational/page/1/', '/author/Albert-Einstein', '/tag/adulthood/page/1/', '/tag/success/page/1/', '/tag/value/page/1/', '/author/Andre-Gide', '/tag/life/page/1/', '/tag/love/page/1/', '/author/Thomas-A-Edison', '/tag/edison/page/1/', '/tag/failure/page/1/', '/tag/inspirational/page/1/', '/tag/paraphrased/page/1/', '/author/Eleanor-Roosevelt', '/tag/misattributed-eleanor-roosevelt/page/1/', '/author/Steve-Martin', '/tag/humor/page/1/', '/tag/obvious/page/1/', '/tag/simile/page/1/', '/page/2/', '/tag/love/', '/tag/inspirational/', '/tag/life/', '/tag/humor/', '/tag/books/', '/tag/reading/', '/tag/friendship/', '/tag/friends/', '/tag/truth/', '/tag/simile/', 'https://www.goodreads.com/quotes', 'https://scrapinghub.com']
This test: tests/test_parse_multiple_items.py now fails as it should.
Handle errors when the aiohttp response goes wrong.
import asyncio

from .log import logger

try:
    import uvloop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

async def fetch(url, spider, session, semaphore):
    with (await semaphore):
        try:
            if callable(spider.headers):
                headers = spider.headers()
            else:
                headers = spider.headers
            # changed here: honour an optional per-spider encoding
            if hasattr(spider, 'encoding'):
                codec = spider.encoding
            else:
                codec = 'utf-8'
            async with session.get(url, headers=headers) as response:
                if response.status in [200, 201]:
                    data = await response.text(encoding=codec)  # changed here: decode with that encoding
                    return data
                logger.error('Error: {} {}'.format(url, response.status))
                return None
        except:
            return None
class MySpider(Spider):
    concurrency = 5
    encoding = 'gbk'
    start_url = r'http://blog.sciencenet.cn/home.php?mod=space&uid=40109&do=blog&view=me&from=space&page=1'
    parsers = [Parser('http://blog.sciencenet.cn/home.php.*?page=\d+', Post)]
Check the last PR, which was closed by the author himself:
#50
Add homepage.
Add a note to the requirements (uvloop does not support Windows clients):
Collecting gain
Downloading gain-0.1.1.tar.gz
Collecting uvloop (from gain)
Downloading uvloop-0.8.0.tar.gz (1.7MB)
100% |################################| 1.7MB 534kB/s
Complete output from command python setup.py egg_info:
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "C:\Users\idi\AppData\Local\Temp\pip-build-gjya289j\uvloop\setup.py", line 11, in <module>
raise RuntimeError('uvloop does not support Windows at the moment')
RuntimeError: uvloop does not support Windows at the moment
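A sketch of declaring uvloop as a non-Windows dependency in setup.py via a standard PEP 508 environment marker (hypothetical; gain's actual setup.py may differ):

from setuptools import setup

setup(
    name='gain',
    install_requires=[
        'aiohttp',
        'pyquery',
        # Skip uvloop on Windows, where it does not build:
        'uvloop; sys_platform != "win32"',
    ],
)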