- instagram 크롤러 수정 (Tag, Comment, User)

- Debug 메시지 수정


git-svn-id: svn://192.168.0.12/source@351 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2017-06-29 03:12:39 +00:00
parent b2e4fdadb3
commit 1e449a45af
2 changed files with 109 additions and 44 deletions

View File

@@ -12,6 +12,7 @@ import threading
import re import re
import pymysql import pymysql
import random import random
import inspect
from time import localtime, strftime from time import localtime, strftime
@@ -24,13 +25,37 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
is_debug = False is_debug = False
def is_debugger_attached():
    """Report whether a frame from a file named ``pydevd.py`` is on the stack.

    The check walks the current call stack and compares each frame's source
    file name (presumably ``pydevd.py`` is the PyDev/PyCharm debugger entry
    script -- confirm for the IDE in use).

    Returns:
        bool: True if any frame's file name ends with ``pydevd.py``,
        otherwise False.
    """
    # context=0: only the file names are inspected, so do not make inspect
    # load source lines for every frame (the default context=1 does).
    return any(frame[1].endswith("pydevd.py") for frame in inspect.stack(0))
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print like the built-in ``print``; in debug mode prefix the call site.

    When the module-level ``is_debug`` flag is false, the arguments are passed
    straight to ``print``. When it is true, the output has the form
    ``<file>(<line>) : <objects joined by sep>`` where file and line identify
    the caller. If the direct caller is a function named ``printd``, that
    frame is skipped so the location of ``printd``'s caller is reported.

    Args:
        *objects: Values to print; each is converted with ``str``.
        sep: Separator placed between objects (``None`` means one space).
        end: String appended after the last object.
        file: Target stream, forwarded to ``print``.
        flush: Whether to flush the stream, forwarded to ``print``.
    """
    if not is_debug:
        print(*objects, sep=sep, end=end, file=file, flush=flush)
        return

    # frames[0] is printl itself, frames[1] its caller. context=0 because only
    # the file name, line number and function name are needed.
    frames = inspect.getouterframes(inspect.currentframe(), 0)
    caller = frames[2 if frames[1][3] == 'printd' else 1]
    file_path, line_no = caller[1], caller[2]

    try:
        # Join every object (not just the first) so multi-argument and
        # zero-argument calls produce the same text print() would.
        joiner = ' ' if sep is None else sep
        message = joiner.join(str(obj) for obj in objects)
        print('{}({}) : {}'.format(file_path, line_no, message), end=end, file=file, flush=flush)
    except Exception as e:
        # Best effort: a failing debug print must not abort the caller.
        print(e)
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print through ``printl`` only when the module-level ``is_debug`` is true.

    Args:
        *objects: Values to print.
        sep: Separator placed between objects.
        end: String appended after the last object.
        file: Target stream.
        flush: Whether to flush the stream.
    """
    if is_debug:
        # Unpack the objects and forward the options by keyword; passing them
        # positionally would make printl treat the tuple and the options as
        # objects to print.
        printl(*objects, sep=sep, end=end, file=file, flush=flush)
def print_and_flush(string): def print_and_flush(string):

View File

@@ -16,18 +16,39 @@ from queue import Queue
import threading import threading
import time import time
import sys import sys
import inspect
from base.baseclasses import SendtoDB from base.baseclasses import SendtoDB
from base.baseclasses import CrawlInit from base.baseclasses import CrawlInit
from base.baseclasses import wait from base.baseclasses import wait
from base.baseclasses import is_debugger_attached
# from base.baseclasses import Browser # from base.baseclasses import Browser
from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.keys import Keys
from base.baseclasses import enter_element from base.baseclasses import enter_element
import base.proxy import base.proxy
import eventlet import eventlet
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print like the built-in ``print``; in debug mode prefix the call site.

    When the module-level ``is_debug`` flag is false, the arguments are passed
    straight to ``print``. When it is true, the output has the form
    ``<file>(<line>) : <objects joined by sep>`` where file and line identify
    the caller. If the direct caller is a function named ``printd``, that
    frame is skipped so the location of ``printd``'s caller is reported.

    Args:
        *objects: Values to print; each is converted with ``str``.
        sep: Separator placed between objects (``None`` means one space).
        end: String appended after the last object.
        file: Target stream, forwarded to ``print``.
        flush: Whether to flush the stream, forwarded to ``print``.
    """
    if not is_debug:
        print(*objects, sep=sep, end=end, file=file, flush=flush)
        return

    # frames[0] is printl itself, frames[1] its caller. context=0 because only
    # the file name, line number and function name are needed.
    frames = inspect.getouterframes(inspect.currentframe(), 0)
    caller = frames[2 if frames[1][3] == 'printd' else 1]
    file_path, line_no = caller[1], caller[2]

    try:
        # Join every object (not just the first) so multi-argument and
        # zero-argument calls produce the same text print() would.
        joiner = ' ' if sep is None else sep
        message = joiner.join(str(obj) for obj in objects)
        print('{}({}) : {}'.format(file_path, line_no, message), end=end, file=file, flush=flush)
    except Exception as e:
        # Best effort: a failing debug print must not abort the caller.
        print(e)
insta_url = "https://www.instagram.com/" insta_url = "https://www.instagram.com/"
@@ -35,13 +56,12 @@ insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/" insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/' insta_body_url = 'https://www.instagram.com/p/'
is_debuging = False is_debug = is_debugger_attached()
is_debug = False
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Forward the call to ``printl`` when ``is_debug`` is true; else do nothing.

    Args:
        *objects: Values to print.
        sep: Separator placed between objects.
        end: String appended after the last object.
        file: Target stream.
        flush: Whether to flush the stream.
    """
    if not is_debug:
        return
    printl(*objects, sep=sep, end=end, file=file, flush=flush)
num_of_list_ajax = 24 num_of_list_ajax = 24
@@ -232,7 +252,7 @@ def ajax_wrapper(func):
return retry_ajax_load return retry_ajax_load
@ajax_wrapper # @ajax_wrapper
def load_ajax_list(ins): def load_ajax_list(ins):
try: try:
insta_list = ins.load_more() insta_list = ins.load_more()
@@ -247,7 +267,7 @@ def load_ajax_list(ins):
return None return None
@ajax_wrapper # @ajax_wrapper
def load_ajax_reply(ins): def load_ajax_reply(ins):
try: try:
replies = ins.load_reply_more() replies = ins.load_reply_more()
@@ -307,7 +327,7 @@ def crawl_content_process(qu, keyword_id, db_num):
try: try:
element = qu.get(timeout=60) element = qu.get(timeout=60)
except Exception as e: except Exception as e:
printl("getting queue is timeout") printl("[crawl_content_process] queue is empty")
continue continue
if element is None: if element is None:
@@ -428,18 +448,35 @@ class ListTag:
return self.list_tag return self.list_tag
def load_more(self):
    """Fetch and parse the next page of the tag listing.

    Requests ``self.__url`` with ``?max_id=<self.end_cursor>`` appended, then
    refreshes ``self.__tag``, the stored cookies, ``self.list_tag``,
    ``self.end_cursor`` and ``self.has_next`` from the response.

    Returns:
        The freshly parsed ``self.list_tag``.

    Raises:
        Any exception raised by ``requests.get`` or by
        ``Response.raise_for_status`` (an HTTP error status).
    """
    # The previous implementation POSTed form data to insta_query and parsed
    # the reply with parse_list_ajax; Instagram appears to have blocked that
    # POST, so the listing page itself is requested with a max_id cursor.
    url = self.__url + "?max_id=" + self.end_cursor
    self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(),
                            proxies=self.proxies, timeout=requests_timeout, stream=True)
    try:
        content = requests_get(self.__r)
        self.log_load_url_before()
        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
    finally:
        # With stream=True the connection stays checked out until the
        # response is closed, so close it even when a step above raises.
        self.__r.close()
    self.log_load_url_after()
    return self.list_tag
def __get_tag(self, url): def __get_tag(self, url):
@@ -472,14 +509,14 @@ class ListTag:
return self.proxies return self.proxies
def log_load_url_before(self): def log_load_url_before(self):
if is_debuging: if is_debug:
printl("<ListTag Start>") printl("<ListTag Start>")
printl("<ListTag requests>") printl("<ListTag requests>")
printl('headers = ', end=' ') printl('headers = ', end=' ')
printl(instaheaders.get_headers_for_list_html()) printl(instaheaders.get_headers_for_list_html())
def log_load_url_after(self): def log_load_url_after(self):
if is_debuging: if is_debug:
printl("<ListTag response>") printl("<ListTag response>")
printl('self.__r.cookies=', end='') printl('self.__r.cookies=', end='')
printl(self.__r.cookies) printl(self.__r.cookies)
@@ -491,7 +528,7 @@ class ListTag:
printl("<ListTag End>") printl("<ListTag End>")
def log_load_more_before(self, form_data, headers): def log_load_more_before(self, form_data, headers):
if is_debuging: if is_debug:
printl("<ListTag Start>") printl("<ListTag Start>")
printl("<ListTag requests>") printl("<ListTag requests>")
printl('end_cursor = ' + str(self.end_cursor)) printl('end_cursor = ' + str(self.end_cursor))
@@ -500,7 +537,7 @@ class ListTag:
printl(headers) printl(headers)
def log_load_more_after(self): def log_load_more_after(self):
if is_debuging: if is_debug:
printl("<ListTag response>") printl("<ListTag response>")
printl('self.__r.cookies=', end='') printl('self.__r.cookies=', end='')
printl(self.__r.cookies) printl(self.__r.cookies)
@@ -537,19 +574,21 @@ class ListUser:
return self.list_user return self.list_user
def load_more(self):
    """Fetch and parse the next page of the user listing.

    Requests ``self.__url`` with ``?max_id=<self.end_cursor>`` appended, then
    refreshes the stored cookies, ``self.list_user``, ``self.end_cursor``,
    ``self.has_next`` and ``self.__user`` from the response.

    Returns:
        The freshly parsed ``self.list_user``.

    Raises:
        Any exception raised by ``requests.get`` or by
        ``Response.raise_for_status`` (an HTTP error status).
    """
    # Replaces the earlier POST of form data to insta_query (parsed with
    # parse_list_ajax); the page is now fetched with a max_id cursor instead.
    url = self.__url + "?max_id=" + self.end_cursor
    self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(),
                            proxies=self.proxies, timeout=requests_timeout, stream=True)
    try:
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        (self.list_user, self.end_cursor,
         self.has_next, self.__user) = instaparser.parse_list_user_html(content)
    finally:
        # With stream=True the connection stays checked out until the
        # response is closed, so close it even when a step above raises.
        self.__r.close()
    return self.list_user
def get_cookies(self): def get_cookies(self):
@@ -575,7 +614,7 @@ class ListUser:
return self.proxies return self.proxies
def log_load_more_before(self, form_data, headers): def log_load_more_before(self, form_data, headers):
if is_debuging: if is_debug:
printl("<ListUser Start>") printl("<ListUser Start>")
printl("<ListUser requests>") printl("<ListUser requests>")
printl('end_cursor = ' + str(self.end_cursor)) printl('end_cursor = ' + str(self.end_cursor))
@@ -584,7 +623,7 @@ class ListUser:
printl(headers) printl(headers)
def log_load_more_after(self): def log_load_more_after(self):
if is_debuging: if is_debug:
printl("<ListUser response>") printl("<ListUser response>")
printl('self.__r.cookies=', end='') printl('self.__r.cookies=', end='')
printl(self.__r.cookies) printl(self.__r.cookies)
@@ -630,18 +669,17 @@ class InstaContent:
return self.reply return self.reply
def load_reply_more(self):
    """Fetch and parse the next batch of replies for this post.

    Requests ``self.__referer`` with ``?max_id=<self.start_cursor>`` appended,
    then refreshes ``self.__code``, ``self.body``, ``self.reply``,
    ``self.start_cursor``, ``self.has_previous`` and the stored cookies from
    the response.

    Returns:
        The freshly parsed ``self.reply``.

    Raises:
        Any exception raised by ``requests.get`` or by
        ``Response.raise_for_status`` (an HTTP error status).
    """
    # Replaces the earlier POST of form data to insta_query (parsed with
    # parse_reply_ajax); the post page is now fetched with a max_id cursor.
    url = self.__referer + "?max_id=" + self.start_cursor
    self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies),
                            proxies=self.proxies, timeout=requests_timeout, stream=True)
    try:
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__code = self.__get_code(url)
        (self.body, self.reply,
         self.start_cursor, self.has_previous) = instaparser.parse_body_html(content)
        self.__set_cookies(self.__r.cookies)
    finally:
        # With stream=True the connection stays checked out until the
        # response is closed, so close it even when a step above raises.
        self.__r.close()
    return self.reply
def get_cookies(self): def get_cookies(self):
@@ -662,7 +700,7 @@ class InstaContent:
return self.proxies return self.proxies
def log_load_reply_more_before(self, form_data, headers): def log_load_reply_more_before(self, form_data, headers):
if is_debuging: if is_debug:
printl("<ContentReply Start>") printl("<ContentReply Start>")
printl("<ContentReply requests>") printl("<ContentReply requests>")
printl('start_cursor = ' + self.start_cursor) printl('start_cursor = ' + self.start_cursor)
@@ -671,7 +709,7 @@ class InstaContent:
printl(headers) printl(headers)
def log_load_reply_more_after(self): def log_load_reply_more_after(self):
if is_debuging: if is_debug:
printl("<ContentReply response>") printl("<ContentReply response>")
printl('self.__r.cookies=', end='') printl('self.__r.cookies=', end='')
printl(self.__r.cookies) printl(self.__r.cookies)
@@ -722,7 +760,7 @@ class InstaAlgorithm:
self.close() self.close()
def close(self): def close(self):
if self.driver and not is_debuging: if self.driver and not is_debug:
self.driver.quit() self.driver.quit()
self.send_to_db.close() self.send_to_db.close()
printl("Finished Crawling :)") printl("Finished Crawling :)")
@@ -760,6 +798,7 @@ class InstaAlgorithm:
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S")) # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
# wait(1.5) # wait(1.5)
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url()) # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
printl("element insert to queue {}".format(element['url']))
self.list_crawl.append(element) self.list_crawl.append(element)
backup_set.add(element['url']) backup_set.add(element['url'])
if self.is_until_page(): if self.is_until_page():
@@ -897,6 +936,7 @@ class InstaAlgorithmMulti(InstaAlgorithm):
# p_list = [multiprocessing.Process(target=crawl_content_process, # p_list = [multiprocessing.Process(target=crawl_content_process,
# args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num)) # args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
# for i in range(num_of_content_process)] # for i in range(num_of_content_process)]
printl("{} processs start".format(num_of_content_process))
p_list = [threading.Thread(target=crawl_content_process, p_list = [threading.Thread(target=crawl_content_process,
args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num)) args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
for i in range(num_of_content_process)] for i in range(num_of_content_process)]