- instagram 크롤러 수정 (Tag, Comment, User)
- Debug 메시지 수정

git-svn-id: svn://192.168.0.12/source@351 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -12,6 +12,7 @@ import threading
|
||||
import re
|
||||
import pymysql
|
||||
import random
|
||||
import inspect
|
||||
|
||||
from time import localtime, strftime
|
||||
|
||||
@@ -24,13 +25,37 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
||||
|
||||
is_debug = False
|
||||
|
||||
def is_debugger_attached():
    """Return True when a ``pydevd.py`` frame is on the current call stack.

    The check walks the call stack and looks for a frame whose source file
    name ends with ``pydevd.py``.

    Returns:
        bool: True if such a frame is found, False otherwise.
    """
    # context=0: only the file name of each frame is inspected, so there is no
    # need to read source lines for every frame on the stack.
    return any(frame[1].endswith("pydevd.py") for frame in inspect.stack(0))
|
||||
|
||||
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print like the built-in ``print``; in debug mode prefix the call site.

    When the module-level ``is_debug`` flag is true the output has the form
    ``<file>(<line>) : <objects joined by sep>``, where file and line identify
    the caller (or the caller of ``printd`` when the call came through
    ``printd``). Otherwise the arguments are forwarded to ``print`` unchanged.

    Args:
        *objects: Values to print.
        sep: Separator placed between the objects.
        end: String appended after the last object.
        file: Target stream, passed straight to ``print``.
        flush: Whether to flush the stream after writing.
    """
    if not is_debug:
        print(*objects, sep=sep, end=end, file=file, flush=flush)
        return

    cur_frame = inspect.currentframe()
    call_frame = None
    try:
        # context=0: only file name and line number are used, so source lines
        # are not loaded for the frames.
        call_frame = inspect.getouterframes(cur_frame, 0)

        # Index 1 is the direct caller; when that caller is printd, step one
        # frame further so the reported location is printd's own caller.
        frame_no = 2 if call_frame[1][3] == 'printd' else 1
        file_path = call_frame[frame_no][1]
        line_no = call_frame[frame_no][2]

        # Join every object explicitly: formatting with ``*objects`` printed
        # only the first one, ignored ``sep`` and failed when none was given.
        message = sep.join(str(obj) for obj in objects)
        print('{}({}) : {}'.format(file_path, line_no, message), end=end, file=file, flush=flush)
    except Exception as e:
        # Logging is best-effort: report the problem instead of raising.
        print(e)
    finally:
        # Frame objects reference this function's locals; drop them to avoid
        # keeping reference cycles alive.
        del call_frame
        del cur_frame
|
||||
|
||||
|
||||
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print through ``printl`` only when the module-level ``is_debug`` is true.

    Args:
        *objects: Values to print.
        sep: Separator placed between the objects.
        end: String appended after the last object.
        file: Target stream, forwarded to ``printl``.
        flush: Whether to flush the stream after writing.
    """
    if is_debug:
        # Unpack the objects and pass the options by keyword: printl takes
        # ``*objects`` first, so positional options would be printed as
        # additional objects instead of being applied.
        printl(*objects, sep=sep, end=end, file=file, flush=flush)
|
||||
|
||||
|
||||
def print_and_flush(string):
|
||||
|
||||
@@ -16,18 +16,39 @@ from queue import Queue
|
||||
import threading
|
||||
import time
|
||||
import sys
|
||||
import inspect
|
||||
|
||||
|
||||
from base.baseclasses import SendtoDB
|
||||
from base.baseclasses import CrawlInit
|
||||
from base.baseclasses import wait
|
||||
from base.baseclasses import is_debugger_attached
|
||||
# from base.baseclasses import Browser
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from base.baseclasses import enter_element
|
||||
import base.proxy
|
||||
import eventlet
|
||||
|
||||
|
||||
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print like the built-in ``print``; in debug mode prefix the call site.

    When the module-level ``is_debug`` flag is true the output has the form
    ``<file>(<line>) : <objects joined by sep>``, where file and line identify
    the caller (or the caller of ``printd`` when the call came through
    ``printd``). Otherwise the arguments are forwarded to ``print`` unchanged.

    Args:
        *objects: Values to print.
        sep: Separator placed between the objects.
        end: String appended after the last object.
        file: Target stream, passed straight to ``print``.
        flush: Whether to flush the stream after writing.
    """
    if not is_debug:
        print(*objects, sep=sep, end=end, file=file, flush=flush)
        return

    cur_frame = inspect.currentframe()
    call_frame = None
    try:
        # context=0: only file name and line number are used, so source lines
        # are not loaded for the frames.
        call_frame = inspect.getouterframes(cur_frame, 0)

        # Index 1 is the direct caller; when that caller is printd, step one
        # frame further so the reported location is printd's own caller.
        frame_no = 2 if call_frame[1][3] == 'printd' else 1
        file_path = call_frame[frame_no][1]
        line_no = call_frame[frame_no][2]

        # Join every object explicitly: formatting with ``*objects`` printed
        # only the first one, ignored ``sep`` and failed when none was given.
        message = sep.join(str(obj) for obj in objects)
        print('{}({}) : {}'.format(file_path, line_no, message), end=end, file=file, flush=flush)
    except Exception as e:
        # Logging is best-effort: report the problem instead of raising.
        print(e)
    finally:
        # Frame objects reference this function's locals; drop them to avoid
        # keeping reference cycles alive.
        del call_frame
        del cur_frame
|
||||
|
||||
insta_url = "https://www.instagram.com/"
|
||||
@@ -35,13 +56,12 @@ insta_tag_url = "https://www.instagram.com/explore/tags/"
|
||||
insta_query = "https://www.instagram.com/query/"
|
||||
insta_body_url = 'https://www.instagram.com/p/'
|
||||
|
||||
is_debuging = False
|
||||
is_debug = False
|
||||
is_debug = is_debugger_attached()
|
||||
|
||||
|
||||
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print through ``printl`` only when the module-level ``is_debug`` is true.

    Args:
        *objects: Values to print.
        sep: Separator placed between the objects.
        end: String appended after the last object.
        file: Target stream, forwarded to ``printl``.
        flush: Whether to flush the stream after writing.
    """
    if is_debug:
        # Emit the message exactly once, via printl, so it carries the call
        # site prefix; a plain print alongside it would duplicate every line.
        printl(*objects, sep=sep, end=end, file=file, flush=flush)
|
||||
|
||||
|
||||
num_of_list_ajax = 24
|
||||
@@ -232,7 +252,7 @@ def ajax_wrapper(func):
|
||||
return retry_ajax_load
|
||||
|
||||
|
||||
@ajax_wrapper
|
||||
# @ajax_wrapper
|
||||
def load_ajax_list(ins):
|
||||
try:
|
||||
insta_list = ins.load_more()
|
||||
@@ -247,7 +267,7 @@ def load_ajax_list(ins):
|
||||
return None
|
||||
|
||||
|
||||
@ajax_wrapper
|
||||
# @ajax_wrapper
|
||||
def load_ajax_reply(ins):
|
||||
try:
|
||||
replies = ins.load_reply_more()
|
||||
@@ -307,7 +327,7 @@ def crawl_content_process(qu, keyword_id, db_num):
|
||||
try:
|
||||
element = qu.get(timeout=60)
|
||||
except Exception as e:
|
||||
printl("getting queue is timeout")
|
||||
printl("[crawl_content_process] queue is empty")
|
||||
continue
|
||||
|
||||
if element is None:
|
||||
@@ -428,18 +448,35 @@ class ListTag:
|
||||
return self.list_tag
|
||||
|
||||
def load_more(self):
|
||||
form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
|
||||
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
|
||||
self.log_load_more_before(form_data, headers)
|
||||
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
|
||||
url = self.__url + "?max_id="+self.end_cursor
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies,
|
||||
timeout=requests_timeout, stream=True)
|
||||
content = requests_get(self.__r)
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
|
||||
self.log_load_url_before()
|
||||
self.__r.raise_for_status()
|
||||
# self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
|
||||
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
|
||||
self.__tag = self.__get_tag(url)
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
# self.__url = url
|
||||
# self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
|
||||
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
|
||||
self.__r.close()
|
||||
self.log_load_more_after()
|
||||
self.log_load_url_after()
|
||||
|
||||
# 기존 방식 instagram에서 post를 막은 듯
|
||||
# form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
|
||||
# headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
|
||||
# self.log_load_more_before(form_data, headers)
|
||||
# self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
|
||||
# timeout=requests_timeout, stream=True)
|
||||
# content = requests_get(self.__r)
|
||||
# self.__set_cookies(self.__r.cookies)
|
||||
# self.__r.raise_for_status()
|
||||
# # self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
|
||||
# self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
|
||||
# self.__r.close()
|
||||
# self.log_load_more_after()
|
||||
|
||||
return self.list_tag
|
||||
|
||||
def __get_tag(self, url):
|
||||
@@ -472,14 +509,14 @@ class ListTag:
|
||||
return self.proxies
|
||||
|
||||
def log_load_url_before(self):
|
||||
if is_debuging:
|
||||
if is_debug:
|
||||
printl("<ListTag Start>")
|
||||
printl("<ListTag requests>")
|
||||
printl('headers = ', end=' ')
|
||||
printl(instaheaders.get_headers_for_list_html())
|
||||
|
||||
def log_load_url_after(self):
|
||||
if is_debuging:
|
||||
if is_debug:
|
||||
printl("<ListTag response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
@@ -491,7 +528,7 @@ class ListTag:
|
||||
printl("<ListTag End>")
|
||||
|
||||
def log_load_more_before(self, form_data, headers):
|
||||
if is_debuging:
|
||||
if is_debug:
|
||||
printl("<ListTag Start>")
|
||||
printl("<ListTag requests>")
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
@@ -500,7 +537,7 @@ class ListTag:
|
||||
printl(headers)
|
||||
|
||||
def log_load_more_after(self):
|
||||
if is_debuging:
|
||||
if is_debug:
|
||||
printl("<ListTag response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
@@ -537,19 +574,21 @@ class ListUser:
|
||||
return self.list_user
|
||||
|
||||
def load_more(self):
|
||||
form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
|
||||
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
|
||||
self.log_load_more_before(form_data, headers)
|
||||
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
|
||||
url = self.__url + "?max_id=" + self.end_cursor
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=self.proxies,
|
||||
timeout=requests_timeout, stream=True)
|
||||
# form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
|
||||
# headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
|
||||
# self.log_load_more_before(form_data, headers)
|
||||
# self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
|
||||
# timeout=requests_timeout, stream=True)
|
||||
content = requests_get(self.__r)
|
||||
self.__r.raise_for_status()
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
|
||||
# self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
|
||||
self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
|
||||
# self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(content)
|
||||
self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(content)
|
||||
self.__r.close()
|
||||
self.log_load_more_after()
|
||||
# self.log_load_more_after()
|
||||
return self.list_user
|
||||
|
||||
def get_cookies(self):
|
||||
@@ -575,7 +614,7 @@ class ListUser:
|
||||
return self.proxies
|
||||
|
||||
def log_load_more_before(self, form_data, headers):
|
||||
if is_debuging:
|
||||
if is_debug:
|
||||
printl("<ListUser Start>")
|
||||
printl("<ListUser requests>")
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
@@ -584,7 +623,7 @@ class ListUser:
|
||||
printl(headers)
|
||||
|
||||
def log_load_more_after(self):
|
||||
if is_debuging:
|
||||
if is_debug:
|
||||
printl("<ListUser response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
@@ -630,18 +669,17 @@ class InstaContent:
|
||||
return self.reply
|
||||
|
||||
def load_reply_more(self):
|
||||
form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
|
||||
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
|
||||
self.log_load_reply_more_before(form_data, headers)
|
||||
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
|
||||
url = self.__referer + "?max_id="+self.start_cursor
|
||||
# self.log_load_reply_more_before(form_data, headers)
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=self.proxies,
|
||||
timeout=requests_timeout, stream=True)
|
||||
content = requests_get(self.__r)
|
||||
self.__r.raise_for_status()
|
||||
self.__code = self.__get_code(url)
|
||||
self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(content)
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
# self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
|
||||
self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(content)
|
||||
self.__r.close()
|
||||
self.log_load_reply_more_after()
|
||||
# self.log_load_reply_more_after()
|
||||
return self.reply
|
||||
|
||||
def get_cookies(self):
|
||||
@@ -662,7 +700,7 @@ class InstaContent:
|
||||
return self.proxies
|
||||
|
||||
def log_load_reply_more_before(self, form_data, headers):
|
||||
if is_debuging:
|
||||
if is_debug:
|
||||
printl("<ContentReply Start>")
|
||||
printl("<ContentReply requests>")
|
||||
printl('start_cursor = ' + self.start_cursor)
|
||||
@@ -671,7 +709,7 @@ class InstaContent:
|
||||
printl(headers)
|
||||
|
||||
def log_load_reply_more_after(self):
|
||||
if is_debuging:
|
||||
if is_debug:
|
||||
printl("<ContentReply response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
@@ -722,7 +760,7 @@ class InstaAlgorithm:
|
||||
self.close()
|
||||
|
||||
def close(self):
|
||||
if self.driver and not is_debuging:
|
||||
if self.driver and not is_debug:
|
||||
self.driver.quit()
|
||||
self.send_to_db.close()
|
||||
printl("Finished Crawling :)")
|
||||
@@ -760,6 +798,7 @@ class InstaAlgorithm:
|
||||
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
|
||||
# wait(1.5)
|
||||
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
|
||||
printl("element insert to queue {}".format(element['url']))
|
||||
self.list_crawl.append(element)
|
||||
backup_set.add(element['url'])
|
||||
if self.is_until_page():
|
||||
@@ -897,6 +936,7 @@ class InstaAlgorithmMulti(InstaAlgorithm):
|
||||
# p_list = [multiprocessing.Process(target=crawl_content_process,
|
||||
# args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
|
||||
# for i in range(num_of_content_process)]
|
||||
printl("{} processs start".format(num_of_content_process))
|
||||
p_list = [threading.Thread(target=crawl_content_process,
|
||||
args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
|
||||
for i in range(num_of_content_process)]
|
||||
|
||||
Reference in New Issue
Block a user