git-svn-id: svn://192.168.0.12/source@345 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -22,6 +22,6 @@
|
||||
|
||||
default=chrome
|
||||
kakaostory=chrome
|
||||
#instagram=firefox
|
||||
navercafe=firefox
|
||||
instagram=chrome
|
||||
navercafe=chrome
|
||||
#facebook=chrome
|
||||
|
||||
@@ -86,6 +86,7 @@ def requests_get(req, timeout=requests_timeout):
|
||||
if time.time() > (start + timeout):
|
||||
req.close()
|
||||
raise Exception("timeout")
|
||||
|
||||
return b''.join(body)
|
||||
|
||||
|
||||
@@ -313,6 +314,7 @@ def crawl_content_process(qu, keyword_id, db_num):
|
||||
break
|
||||
ok = True
|
||||
while ok:
|
||||
time.sleep(2)
|
||||
try:
|
||||
# get an instance of InstaContent via the do_no_proxy func.
|
||||
# if element['url'] is invalid, content is None
|
||||
@@ -339,6 +341,7 @@ def crawl_content_process(qu, keyword_id, db_num):
|
||||
send_to_db.send_body(body)
|
||||
if replies:
|
||||
send_to_db.send_reply(replies)
|
||||
printl("proxies = ", content.proxies['http'][7:])
|
||||
printl(element['url'])
|
||||
printl('ok')
|
||||
ok = False
|
||||
@@ -411,9 +414,9 @@ class ListTag:
|
||||
self.load_url(url, self.proxies)
|
||||
|
||||
def load_url(self, url, proxies):
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
|
||||
timeout=requests_timeout, stream=True)
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, timeout=requests_timeout, stream=True)
|
||||
content = requests_get(self.__r)
|
||||
|
||||
self.log_load_url_before()
|
||||
self.__r.raise_for_status()
|
||||
self.__tag = self.__get_tag(url)
|
||||
|
||||
@@ -6,7 +6,6 @@ def get_headers_for_list_html():
|
||||
" Chrome/50.0.2661.102 Safari/537.36"
|
||||
}
|
||||
|
||||
|
||||
def get_headers_for_body_html(cookies):
|
||||
if cookies:
|
||||
request_headers = {
|
||||
|
||||
@@ -4,6 +4,7 @@ import requests
|
||||
import datetime
|
||||
|
||||
rx_json_html = re.compile('window\._sharedData\s*=\s*(.*);\s*</script>')
|
||||
#rx_json_html = re.compile('window\._sharedData\s*=\s*(.*);</script>')
|
||||
old_date = datetime.datetime(1970, 1, 1, 9)
|
||||
|
||||
|
||||
@@ -17,7 +18,10 @@ def get_json_from_html(content):
|
||||
else:
|
||||
raise TypeError
|
||||
m = rx_json_html.search(s)
|
||||
|
||||
if m:
|
||||
#return json.dumps(json.loads(m.group(1)))
|
||||
#return json.loads(json.dumps(m.group(1)))
|
||||
return json.loads(m.group(1))
|
||||
else:
|
||||
raise TypeError("Check requests.response")
|
||||
@@ -50,14 +54,14 @@ def parse_list_user_html(content):
|
||||
|
||||
def parse_list_tag_html(content):
|
||||
json_data = get_json_from_html(content)
|
||||
tagpage = json_data['entry_data']['TagPage']
|
||||
tagpage = json_data["entry_data"]["TagPage"]
|
||||
|
||||
has_next = False
|
||||
end_cursor = None
|
||||
body_list = []
|
||||
if tagpage:
|
||||
print('start_cursor = ', end='', flush=True)
|
||||
print(tagpage[0]["tag"]["media"]["page_info"]["start_cursor"], flush=True)
|
||||
#print('start_cursor = ', end='', flush=True)
|
||||
#print(tagpage[0]["tag"]["media"]["page_info"]["start_cursor"], flush=True) #start_cursor doesn't exist
|
||||
end_cursor = tagpage[0]["tag"]["media"]["page_info"]["end_cursor"]
|
||||
has_next = tagpage[0]["tag"]["media"]["page_info"]["has_next_page"]
|
||||
nodes = tagpage[0]["tag"]["media"]["nodes"]
|
||||
|
||||
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user