git-svn-id: svn://192.168.0.12/source@345 8346c931-da38-4b9b-9d4c-e48b93cbd075

This commit is contained in:
admin
2017-03-29 03:19:06 +00:00
parent ec45528679
commit 4fa93a7cc4
5 changed files with 17 additions and 11 deletions

View File

@@ -86,6 +86,7 @@ def requests_get(req, timeout=requests_timeout):
if time.time() > (start + timeout):
req.close()
raise Exception("timeout")
return b''.join(body)
@@ -313,6 +314,7 @@ def crawl_content_process(qu, keyword_id, db_num):
break
ok = True
while ok:
time.sleep(2)
try:
# get an instance of InstaContent via the do_no_proxy func.
# if element['url'] is invalid, content is None
@@ -339,6 +341,7 @@ def crawl_content_process(qu, keyword_id, db_num):
send_to_db.send_body(body)
if replies:
send_to_db.send_reply(replies)
printl("proxies = ", content.proxies['http'][7:])
printl(element['url'])
printl('ok')
ok = False
@@ -411,15 +414,15 @@ class ListTag:
self.load_url(url, self.proxies)
def load_url(self, url, proxies):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
timeout=requests_timeout, stream=True)
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.log_load_url_before()
self.__r.raise_for_status()
self.__tag = self.__get_tag(url)
self.__set_cookies(self.__r.cookies)
self.__url = url
# self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
#self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
self.__r.close()
self.log_load_url_after()
@@ -1033,7 +1036,7 @@ class InstaMainCrawler:
def __init__(self):
self.send_to_db = SendtoDB()
self.crawl_init = InstaInit()
# self.browser = Browser()
#self.browser = Browser()
self.browser = None
self.driver = None