- instagram 크롤러 수정 (Tag, Comment, User)

- Debug 메시지 수정


git-svn-id: svn://192.168.0.12/source@351 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2017-06-29 03:12:39 +00:00
parent b2e4fdadb3
commit 1e449a45af
2 changed files with 109 additions and 44 deletions

View File

@@ -12,6 +12,7 @@ import threading
import re
import pymysql
import random
import inspect
from time import localtime, strftime
@@ -24,13 +25,37 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
is_debug = False
def is_debugger_attached():
    """Return True when any frame on the current call stack comes from a
    file whose path ends with ``pydevd.py``, otherwise False.
    """
    # Index 1 of each stack record is the file name of that frame.
    return any(record[1].endswith("pydevd.py") for record in inspect.stack())
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print ``objects``, prefixed with the call site while debugging.

    When the module-level ``is_debug`` flag is false this is a plain
    ``print`` call. When it is true, the output is prefixed with
    ``<file>(<line>) : `` describing where the call was made. If the
    immediate caller is a function named ``printd``, the location of that
    function's caller is reported instead.

    Args:
        *objects: Values to print; every one of them is written, joined by
            ``sep``.
        sep: Separator placed between the objects (``None`` means a space).
        end: String appended after the last object.
        file: Target stream passed through to ``print``.
        flush: Whether to flush the stream after writing.
    """
    if not is_debug:
        print(*objects, sep=sep, end=end, file=file, flush=flush)
        return

    # Only the file name and line number are used, so no source context
    # lines are requested (context=0).
    frames = inspect.getouterframes(inspect.currentframe(), 0)
    # frames[0] is printl itself; skip one more level when called via printd.
    frame_no = 2 if frames[1][3] == 'printd' else 1
    file_path = frames[frame_no][1]
    line_no = frames[frame_no][2]

    # Join all objects explicitly: formatting with '{}' and *objects would
    # keep only the first object and fail when none are given.
    joiner = ' ' if sep is None else sep
    message = joiner.join(str(obj) for obj in objects)
    print('{}({}) : {}'.format(file_path, line_no, message), end=end, file=file, flush=flush)
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print ``objects`` through ``printl`` only while debugging.

    Nothing is written when the module-level ``is_debug`` flag is false.

    Args:
        *objects: Values to print.
        sep: Separator placed between the objects.
        end: String appended after the last object.
        file: Target stream.
        flush: Whether to flush the stream after writing.
    """
    if is_debug:
        # Unpack the objects and pass the options by keyword; passing them
        # positionally would make printl treat the tuple and the options
        # themselves as things to print.
        printl(*objects, sep=sep, end=end, file=file, flush=flush)
def print_and_flush(string):

View File

@@ -16,32 +16,52 @@ from queue import Queue
import threading
import time
import sys
import inspect
from base.baseclasses import SendtoDB
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import is_debugger_attached
# from base.baseclasses import Browser
from selenium.webdriver.common.keys import Keys
from base.baseclasses import enter_element
import base.proxy
import eventlet
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print ``objects``, prefixed with the call site while debugging.

    When the module-level ``is_debug`` flag is false this is a plain
    ``print`` call. When it is true, the output is prefixed with
    ``<file>(<line>) : `` describing where the call was made. If the
    immediate caller is a function named ``printd``, the location of that
    function's caller is reported instead.

    Args:
        *objects: Values to print; every one of them is written, joined by
            ``sep``.
        sep: Separator placed between the objects (``None`` means a space).
        end: String appended after the last object.
        file: Target stream passed through to ``print``.
        flush: Whether to flush the stream after writing.
    """
    if not is_debug:
        print(*objects, sep=sep, end=end, file=file, flush=flush)
        return

    # Only the file name and line number are used, so no source context
    # lines are requested (context=0).
    frames = inspect.getouterframes(inspect.currentframe(), 0)
    # frames[0] is printl itself; skip one more level when called via printd.
    frame_no = 2 if frames[1][3] == 'printd' else 1
    file_path = frames[frame_no][1]
    line_no = frames[frame_no][2]

    # Join all objects explicitly: formatting with '{}' and *objects would
    # keep only the first object and fail when none are given.
    joiner = ' ' if sep is None else sep
    message = joiner.join(str(obj) for obj in objects)
    print('{}({}) : {}'.format(file_path, line_no, message), end=end, file=file, flush=flush)
# Instagram URLs used by this module: site root, tag-explore page prefix,
# query endpoint, and post page prefix.
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'
# NOTE(review): the later hunks of this diff replace every `is_debuging`
# check with `is_debug`, so this line is presumably the removed (old) side
# of the change -- confirm against the repository.
is_debuging = False
# `is_debug` is set to False and then immediately overwritten with the result
# of is_debugger_attached(); the first assignment has no lasting effect.
is_debug = False
is_debug = is_debugger_attached()
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Debug-only print: forward everything to ``printl`` when the
    module-level ``is_debug`` flag is set, otherwise write nothing.
    """
    if not is_debug:
        return
    print_options = {'sep': sep, 'end': end, 'file': file, 'flush': flush}
    printl(*objects, **print_options)
num_of_list_ajax = 24
@@ -232,7 +252,7 @@ def ajax_wrapper(func):
return retry_ajax_load
@ajax_wrapper
# @ajax_wrapper
def load_ajax_list(ins):
try:
insta_list = ins.load_more()
@@ -247,7 +267,7 @@ def load_ajax_list(ins):
return None
@ajax_wrapper
# @ajax_wrapper
def load_ajax_reply(ins):
try:
replies = ins.load_reply_more()
@@ -307,7 +327,7 @@ def crawl_content_process(qu, keyword_id, db_num):
try:
element = qu.get(timeout=60)
except Exception as e:
printl("getting queue is timeout")
printl("[crawl_content_process] queue is empty")
continue
if element is None:
@@ -428,18 +448,35 @@ class ListTag:
return self.list_tag
# Load the next page of the tag listing, store the parsed result on the
# instance (list_tag, end_cursor, has_next) and return self.list_tag.
# NOTE(review): this span is a diff rendered without +/- markers, so lines from
# the old POST-based version and the new GET-based version are interleaved.
# Which lines belong to which side is inferred below -- confirm against the repo.
def load_more(self):
# POST to insta_query with form data (presumably the removed, old approach;
# the same statements reappear commented out at the end of this method).
form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
self.log_load_more_before(form_data, headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
timeout=requests_timeout, stream=True)
# GET the listing page itself, with the cursor passed as the max_id query parameter.
# NOTE(review): string concatenation assumes self.end_cursor is a str -- TODO confirm.
url = self.__url + "?max_id="+self.end_cursor
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies,
timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.__set_cookies(self.__r.cookies)
# NOTE(review): the "before" log hook runs here, after the request was already sent.
self.log_load_url_before()
self.__r.raise_for_status()
# self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
# NOTE(review): url carries the "?max_id=..." suffix here; whether __get_tag
# copes with a query string is not visible in this view -- verify.
self.__tag = self.__get_tag(url)
self.__set_cookies(self.__r.cookies)
# self.__url = url
# self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
self.__r.close()
self.log_load_more_after()
self.log_load_url_after()
# Previous approach; Instagram seems to have blocked the POST request.
# form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
# headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
# self.log_load_more_before(form_data, headers)
# self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
# timeout=requests_timeout, stream=True)
# content = requests_get(self.__r)
# self.__set_cookies(self.__r.cookies)
# self.__r.raise_for_status()
# # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
# self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
# self.__r.close()
# self.log_load_more_after()
return self.list_tag
def __get_tag(self, url):
@@ -472,14 +509,14 @@ class ListTag:
return self.proxies
# Debug logging hook: prints the ListTag start/request markers and the headers
# returned by instaheaders.get_headers_for_list_html().
# NOTE(review): both the `is_debuging` and the `is_debug` checks appear because
# this is a diff rendered without +/- markers; the `is_debuging` line is
# presumably the removed side -- confirm against the repository.
def log_load_url_before(self):
if is_debuging:
if is_debug:
printl("<ListTag Start>")
printl("<ListTag requests>")
# end=' ' keeps the label and the headers printed next on the same output line.
printl('headers = ', end=' ')
printl(instaheaders.get_headers_for_list_html())
def log_load_url_after(self):
if is_debuging:
if is_debug:
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
@@ -491,7 +528,7 @@ class ListTag:
printl("<ListTag End>")
def log_load_more_before(self, form_data, headers):
if is_debuging:
if is_debug:
printl("<ListTag Start>")
printl("<ListTag requests>")
printl('end_cursor = ' + str(self.end_cursor))
@@ -500,7 +537,7 @@ class ListTag:
printl(headers)
def log_load_more_after(self):
if is_debuging:
if is_debug:
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
@@ -537,19 +574,21 @@ class ListUser:
return self.list_user
# Load the next page of the user listing, store the parsed result on the
# instance and return self.list_user.
# NOTE(review): this span is a diff rendered without +/- markers, so lines from
# the old POST-based version and the new GET-based version are interleaved.
# Which lines belong to which side is inferred below -- confirm against the repo.
def load_more(self):
# POST to insta_query with form data (presumably the removed, old approach;
# the same statements reappear commented out just below).
form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
self.log_load_more_before(form_data, headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
timeout=requests_timeout, stream=True)
# GET the listing page itself, with the cursor passed as the max_id query parameter.
# NOTE(review): string concatenation assumes self.end_cursor is a str -- TODO confirm.
url = self.__url + "?max_id=" + self.end_cursor
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies,
timeout=requests_timeout, stream=True)
# form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
# headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
# self.log_load_more_before(form_data, headers)
# self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
# timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.__r.raise_for_status()
self.__set_cookies(self.__r.cookies)
# self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
# self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
# The HTML parser returns four values; the fourth rebinds self.__user.
self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
self.__r.close()
self.log_load_more_after()
# self.log_load_more_after()
return self.list_user
def get_cookies(self):
@@ -575,7 +614,7 @@ class ListUser:
return self.proxies
def log_load_more_before(self, form_data, headers):
if is_debuging:
if is_debug:
printl("<ListUser Start>")
printl("<ListUser requests>")
printl('end_cursor = ' + str(self.end_cursor))
@@ -584,7 +623,7 @@ class ListUser:
printl(headers)
def log_load_more_after(self):
if is_debuging:
if is_debug:
printl("<ListUser response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
@@ -630,18 +669,17 @@ class InstaContent:
return self.reply
# Load more replies for this content item, update the cursor state on the
# instance and return self.reply.
# NOTE(review): this span is a diff rendered without +/- markers, so lines from
# the old POST-based version and the new GET-based version are interleaved.
# Which lines belong to which side is inferred below -- confirm against the repo.
def load_reply_more(self):
# POST to insta_query with form data (presumably the removed, old approach).
form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
self.log_load_reply_more_before(form_data, headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
timeout=requests_timeout, stream=True)
# GET the referer page with the cursor passed as the max_id query parameter.
# NOTE(review): string concatenation assumes self.start_cursor is a str -- TODO confirm.
url = self.__referer + "?max_id="+self.start_cursor
# self.log_load_reply_more_before(form_data, headers)
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies,
timeout=requests_timeout, stream=True)
content = requests_get(self.__r)
self.__r.raise_for_status()
# NOTE(review): url carries the "?max_id=..." suffix here; whether __get_code
# copes with a query string is not visible in this view -- verify.
self.__code = self.__get_code(url)
# Parsing the page HTML also replaces self.body, not only the reply state.
self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
self.__set_cookies(self.__r.cookies)
# self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
# NOTE(review): presumably the removed side of the diff; if it still ran it
# would overwrite the values parsed from the HTML above.
self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(content)
self.__r.close()
self.log_load_reply_more_after()
# self.log_load_reply_more_after()
return self.reply
def get_cookies(self):
@@ -662,7 +700,7 @@ class InstaContent:
return self.proxies
def log_load_reply_more_before(self, form_data, headers):
if is_debuging:
if is_debug:
printl("<ContentReply Start>")
printl("<ContentReply requests>")
printl('start_cursor = ' + self.start_cursor)
@@ -671,7 +709,7 @@ class InstaContent:
printl(headers)
def log_load_reply_more_after(self):
if is_debuging:
if is_debug:
printl("<ContentReply response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
@@ -722,7 +760,7 @@ class InstaAlgorithm:
self.close()
def close(self):
if self.driver and not is_debuging:
if self.driver and not is_debug:
self.driver.quit()
self.send_to_db.close()
printl("Finished Crawling :)")
@@ -760,6 +798,7 @@ class InstaAlgorithm:
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
# wait(1.5)
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
printl("element insert to queue {}".format(element['url']))
self.list_crawl.append(element)
backup_set.add(element['url'])
if self.is_until_page():
@@ -897,6 +936,7 @@ class InstaAlgorithmMulti(InstaAlgorithm):
# p_list = [multiprocessing.Process(target=crawl_content_process,
# args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
# for i in range(num_of_content_process)]
printl("{} processs start".format(num_of_content_process))
p_list = [threading.Thread(target=crawl_content_process,
args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
for i in range(num_of_content_process)]