git-svn-id: svn://192.168.0.12/source@345 8346c931-da38-4b9b-9d4c-e48b93cbd075

This commit is contained in:
admin
2017-03-29 03:19:06 +00:00
parent ec45528679
commit 4fa93a7cc4
5 changed files with 17 additions and 11 deletions

View File

@@ -22,6 +22,6 @@
default=chrome
kakaostory=chrome
#instagram=firefox
navercafe=firefox
instagram=chrome
navercafe=chrome
#facebook=chrome

View File

@@ -86,6 +86,7 @@ def requests_get(req, timeout=requests_timeout):
if time.time() > (start + timeout):
req.close()
raise Exception("timeout")
return b''.join(body)
@@ -313,6 +314,7 @@ def crawl_content_process(qu, keyword_id, db_num):
break
ok = True
while ok:
time.sleep(2)
try:
# get an instance of InstaContent via the do_no_proxy func.
# if element['url'] is invalid, content is None
@@ -339,6 +341,7 @@ def crawl_content_process(qu, keyword_id, db_num):
send_to_db.send_body(body)
if replies:
send_to_db.send_reply(replies)
printl("proxies = ", content.proxies['http'][7:])
printl(element['url'])
printl('ok')
ok = False
@@ -411,9 +414,9 @@ class ListTag:
self.load_url(url, self.proxies)
def load_url(self, url, proxies):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
timeout=requests_timeout, stream=True)
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies, timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.log_load_url_before()
self.__r.raise_for_status()
self.__tag = self.__get_tag(url)

View File

@@ -6,7 +6,6 @@ def get_headers_for_list_html():
" Chrome/50.0.2661.102 Safari/537.36"
}
def get_headers_for_body_html(cookies):
if cookies:
request_headers = {

View File

@@ -4,6 +4,7 @@ import requests
import datetime
rx_json_html = re.compile('window\._sharedData\s*=\s*(.*);\s*</script>')
#rx_json_html = re.compile('window\._sharedData\s*=\s*(.*);</script>')
old_date = datetime.datetime(1970, 1, 1, 9)
@@ -17,7 +18,10 @@ def get_json_from_html(content):
else:
raise TypeError
m = rx_json_html.search(s)
if m:
#return json.dumps(json.loads(m.group(1)))
#return json.loads(json.dumps(m.group(1)))
return json.loads(m.group(1))
else:
raise TypeError("Check requests.response")
@@ -50,14 +54,14 @@ def parse_list_user_html(content):
def parse_list_tag_html(content):
json_data = get_json_from_html(content)
tagpage = json_data['entry_data']['TagPage']
tagpage = json_data["entry_data"]["TagPage"]
has_next = False
end_cursor = None
body_list = []
if tagpage:
print('start_cursor = ', end='', flush=True)
print(tagpage[0]["tag"]["media"]["page_info"]["start_cursor"], flush=True)
#print('start_cursor = ', end='', flush=True)
#print(tagpage[0]["tag"]["media"]["page_info"]["start_cursor"], flush=True) #start_cursor doesn't exist
end_cursor = tagpage[0]["tag"]["media"]["page_info"]["end_cursor"]
has_next = tagpage[0]["tag"]["media"]["page_info"]["has_next_page"]
nodes = tagpage[0]["tag"]["media"]["nodes"]

File diff suppressed because one or more lines are too long