- instagram 크롤러 수정 (Tag, Comment, User)
- Debug 메시지 수정 git-svn-id: svn://192.168.0.12/source@351 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -12,6 +12,7 @@ import threading
|
|||||||
import re
|
import re
|
||||||
import pymysql
|
import pymysql
|
||||||
import random
|
import random
|
||||||
|
import inspect
|
||||||
|
|
||||||
from time import localtime, strftime
|
from time import localtime, strftime
|
||||||
|
|
||||||
@@ -24,13 +25,37 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|||||||
|
|
||||||
is_debug = False
|
is_debug = False
|
||||||
|
|
||||||
|
def is_debugger_attached():
    """Report whether a frame from ``pydevd.py`` is on the current call stack.

    Returns:
        True if any frame on the stack has a file name ending in
        ``pydevd.py`` (the PyDev/PyCharm debugger entry module), else False.
    """
    # context=0: only each frame's file name is inspected, so there is no
    # need to have inspect load source lines for every frame.
    return any(frame_info[1].endswith("pydevd.py")
               for frame_info in inspect.stack(0))
|
||||||
|
|
||||||
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print *objects*; in debug mode prefix the caller's file and line.

    When the module-level ``is_debug`` flag is false this is a plain
    ``print`` that flushes by default.  When it is true the output has the
    form ``<file>(<line>) : <objects joined by sep>``, where file and line
    identify the caller (or the caller of ``printd`` when the call came
    through that wrapper).

    Args:
        *objects: Values to print.
        sep: Separator placed between objects.
        end: String appended after the last object.
        file: Target stream; ``None`` means ``sys.stdout``.
        flush: Whether to flush the stream after printing.
    """
    if not is_debug:
        print(*objects, sep=sep, end=end, file=file, flush=flush)
        return

    # context=0: only file name, line number and function name are used.
    frames = inspect.getouterframes(inspect.currentframe(), 0)
    try:
        # frames[0] is printl itself and frames[1] its caller.  If that
        # caller is the printd wrapper, report printd's caller instead.
        frame_no = 2 if frames[1][3] == 'printd' else 1
        file_path = frames[frame_no][1]
        line_no = frames[frame_no][2]
        # Join every object (print() treats sep=None as a single space) so
        # that all arguments appear, not just the first one.
        joiner = ' ' if sep is None else sep
        message = joiner.join(str(obj) for obj in objects)
        print('{}({}) : {}'.format(file_path, line_no, message),
              end=end, file=file, flush=flush)
    except Exception as e:
        # Debug decoration is best-effort: never let it break the caller.
        print(e)
    finally:
        # Frame records reference live frames; drop them promptly to avoid
        # keeping reference cycles alive.
        del frames
|
||||||
|
|
||||||
|
|
||||||
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print *objects* through ``printl`` only when ``is_debug`` is true.

    Args:
        *objects: Values to print.
        sep: Separator placed between objects.
        end: String appended after the last object.
        file: Target stream; ``None`` means ``sys.stdout``.
        flush: Whether to flush the stream after printing.
    """
    if is_debug:
        # Unpack the objects and forward the options by keyword; passing
        # them positionally would make printl treat the tuple and the
        # options themselves as values to print.
        printl(*objects, sep=sep, end=end, file=file, flush=flush)
|
||||||
|
|
||||||
|
|
||||||
def print_and_flush(string):
|
def print_and_flush(string):
|
||||||
|
|||||||
@@ -16,32 +16,52 @@ from queue import Queue
|
|||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
import sys
|
import sys
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
|
||||||
from base.baseclasses import SendtoDB
|
from base.baseclasses import SendtoDB
|
||||||
from base.baseclasses import CrawlInit
|
from base.baseclasses import CrawlInit
|
||||||
from base.baseclasses import wait
|
from base.baseclasses import wait
|
||||||
|
from base.baseclasses import is_debugger_attached
|
||||||
# from base.baseclasses import Browser
|
# from base.baseclasses import Browser
|
||||||
from selenium.webdriver.common.keys import Keys
|
from selenium.webdriver.common.keys import Keys
|
||||||
from base.baseclasses import enter_element
|
from base.baseclasses import enter_element
|
||||||
import base.proxy
|
import base.proxy
|
||||||
import eventlet
|
import eventlet
|
||||||
|
|
||||||
|
|
||||||
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """Print *objects*; in debug mode prefix the caller's file and line.

    When the module-level ``is_debug`` flag is false this is a plain
    ``print`` that flushes by default.  When it is true the output has the
    form ``<file>(<line>) : <objects joined by sep>``, where file and line
    identify the caller (or the caller of ``printd`` when the call came
    through that wrapper).

    Args:
        *objects: Values to print.
        sep: Separator placed between objects.
        end: String appended after the last object.
        file: Target stream; ``None`` means ``sys.stdout``.
        flush: Whether to flush the stream after printing.
    """
    if not is_debug:
        print(*objects, sep=sep, end=end, file=file, flush=flush)
        return

    # context=0: only file name, line number and function name are used.
    frames = inspect.getouterframes(inspect.currentframe(), 0)
    try:
        # frames[0] is printl itself and frames[1] its caller.  If that
        # caller is the printd wrapper, report printd's caller instead.
        frame_no = 2 if frames[1][3] == 'printd' else 1
        file_path = frames[frame_no][1]
        line_no = frames[frame_no][2]
        # Join every object (print() treats sep=None as a single space) so
        # that all arguments appear, not just the first one.
        joiner = ' ' if sep is None else sep
        message = joiner.join(str(obj) for obj in objects)
        print('{}({}) : {}'.format(file_path, line_no, message),
              end=end, file=file, flush=flush)
    except Exception as e:
        # Debug decoration is best-effort: never let it break the caller.
        print(e)
    finally:
        # Frame records reference live frames; drop them promptly to avoid
        # keeping reference cycles alive.
        del frames
|
||||||
|
|
||||||
insta_url = "https://www.instagram.com/"
|
insta_url = "https://www.instagram.com/"
|
||||||
insta_tag_url = "https://www.instagram.com/explore/tags/"
|
insta_tag_url = "https://www.instagram.com/explore/tags/"
|
||||||
insta_query = "https://www.instagram.com/query/"
|
insta_query = "https://www.instagram.com/query/"
|
||||||
insta_body_url = 'https://www.instagram.com/p/'
|
insta_body_url = 'https://www.instagram.com/p/'
|
||||||
|
|
||||||
is_debuging = False
|
is_debug = is_debugger_attached()
|
||||||
is_debug = False
|
|
||||||
|
|
||||||
|
|
||||||
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """Debug-only print: forward to ``printl`` when ``is_debug`` is true.

    Does nothing when the module-level ``is_debug`` flag is false.  The
    arguments have the same meaning as for the built-in ``print``.
    """
    if not is_debug:
        return
    print_options = {'sep': sep, 'end': end, 'file': file, 'flush': flush}
    printl(*objects, **print_options)
|
||||||
|
|
||||||
|
|
||||||
num_of_list_ajax = 24
|
num_of_list_ajax = 24
|
||||||
@@ -232,7 +252,7 @@ def ajax_wrapper(func):
|
|||||||
return retry_ajax_load
|
return retry_ajax_load
|
||||||
|
|
||||||
|
|
||||||
@ajax_wrapper
|
# @ajax_wrapper
|
||||||
def load_ajax_list(ins):
|
def load_ajax_list(ins):
|
||||||
try:
|
try:
|
||||||
insta_list = ins.load_more()
|
insta_list = ins.load_more()
|
||||||
@@ -247,7 +267,7 @@ def load_ajax_list(ins):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@ajax_wrapper
|
# @ajax_wrapper
|
||||||
def load_ajax_reply(ins):
|
def load_ajax_reply(ins):
|
||||||
try:
|
try:
|
||||||
replies = ins.load_reply_more()
|
replies = ins.load_reply_more()
|
||||||
@@ -307,7 +327,7 @@ def crawl_content_process(qu, keyword_id, db_num):
|
|||||||
try:
|
try:
|
||||||
element = qu.get(timeout=60)
|
element = qu.get(timeout=60)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
printl("getting queue is timeout")
|
printl("[crawl_content_process] queue is empty")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if element is None:
|
if element is None:
|
||||||
@@ -428,18 +448,35 @@ class ListTag:
|
|||||||
return self.list_tag
|
return self.list_tag
|
||||||
|
|
||||||
def load_more(self):
    """Fetch the next page of the tag listing and return its parsed entries.

    Requests ``self.__url`` with ``?max_id=<end_cursor>`` appended, then
    refreshes ``self.__tag``, the stored cookies, ``self.list_tag``,
    ``self.end_cursor`` and ``self.has_next`` from the response.

    Returns:
        The updated ``self.list_tag``.

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status()``).
    """
    # The earlier implementation POSTed form data to insta_query and parsed
    # the reply with instaparser.parse_list_ajax; Instagram appears to have
    # blocked that POST, so the HTML listing page is paged via max_id instead.
    url = self.__url + "?max_id=" + self.end_cursor
    self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(),
                            proxies=self.proxies, timeout=requests_timeout, stream=True)
    try:
        # requests_get presumably reads the streamed body - confirm
        # against its definition.
        content = requests_get(self.__r)
        self.log_load_url_before()
        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(content)
    finally:
        # With stream=True the connection stays checked out until the
        # response is closed, so release it even when a step above raises.
        self.__r.close()
    self.log_load_url_after()
    return self.list_tag
|
||||||
|
|
||||||
def __get_tag(self, url):
|
def __get_tag(self, url):
|
||||||
@@ -472,14 +509,14 @@ class ListTag:
|
|||||||
return self.proxies
|
return self.proxies
|
||||||
|
|
||||||
def log_load_url_before(self):
    """Print the list-page request headers when ``is_debug`` is true."""
    if not is_debug:
        return
    for banner in ("<ListTag Start>", "<ListTag requests>"):
        printl(banner)
    # Label and value are emitted by two calls so they share one line.
    printl('headers = ', end=' ')
    printl(instaheaders.get_headers_for_list_html())
|
||||||
|
|
||||||
def log_load_url_after(self):
|
def log_load_url_after(self):
|
||||||
if is_debuging:
|
if is_debug:
|
||||||
printl("<ListTag response>")
|
printl("<ListTag response>")
|
||||||
printl('self.__r.cookies=', end='')
|
printl('self.__r.cookies=', end='')
|
||||||
printl(self.__r.cookies)
|
printl(self.__r.cookies)
|
||||||
@@ -491,7 +528,7 @@ class ListTag:
|
|||||||
printl("<ListTag End>")
|
printl("<ListTag End>")
|
||||||
|
|
||||||
def log_load_more_before(self, form_data, headers):
|
def log_load_more_before(self, form_data, headers):
|
||||||
if is_debuging:
|
if is_debug:
|
||||||
printl("<ListTag Start>")
|
printl("<ListTag Start>")
|
||||||
printl("<ListTag requests>")
|
printl("<ListTag requests>")
|
||||||
printl('end_cursor = ' + str(self.end_cursor))
|
printl('end_cursor = ' + str(self.end_cursor))
|
||||||
@@ -500,7 +537,7 @@ class ListTag:
|
|||||||
printl(headers)
|
printl(headers)
|
||||||
|
|
||||||
def log_load_more_after(self):
|
def log_load_more_after(self):
|
||||||
if is_debuging:
|
if is_debug:
|
||||||
printl("<ListTag response>")
|
printl("<ListTag response>")
|
||||||
printl('self.__r.cookies=', end='')
|
printl('self.__r.cookies=', end='')
|
||||||
printl(self.__r.cookies)
|
printl(self.__r.cookies)
|
||||||
@@ -537,19 +574,21 @@ class ListUser:
|
|||||||
return self.list_user
|
return self.list_user
|
||||||
|
|
||||||
def load_more(self):
    """Fetch the next page of the user listing and return its parsed entries.

    Requests ``self.__url`` with ``?max_id=<end_cursor>`` appended, then
    refreshes the stored cookies, ``self.list_user``, ``self.end_cursor``,
    ``self.has_next`` and ``self.__user`` from the response.

    Returns:
        The updated ``self.list_user``.

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status()``).
    """
    # The earlier implementation POSTed form data to insta_query and parsed
    # the reply with instaparser.parse_list_ajax; it was replaced by paging
    # the HTML listing page via max_id.
    url = self.__url + "?max_id=" + self.end_cursor
    self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(),
                            proxies=self.proxies, timeout=requests_timeout, stream=True)
    try:
        # requests_get presumably reads the streamed body - confirm
        # against its definition.
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        (self.list_user, self.end_cursor,
         self.has_next, self.__user) = instaparser.parse_list_user_html(content)
    finally:
        # With stream=True the connection stays checked out until the
        # response is closed, so release it even when a step above raises.
        self.__r.close()
    return self.list_user
|
||||||
|
|
||||||
def get_cookies(self):
|
def get_cookies(self):
|
||||||
@@ -575,7 +614,7 @@ class ListUser:
|
|||||||
return self.proxies
|
return self.proxies
|
||||||
|
|
||||||
def log_load_more_before(self, form_data, headers):
|
def log_load_more_before(self, form_data, headers):
|
||||||
if is_debuging:
|
if is_debug:
|
||||||
printl("<ListUser Start>")
|
printl("<ListUser Start>")
|
||||||
printl("<ListUser requests>")
|
printl("<ListUser requests>")
|
||||||
printl('end_cursor = ' + str(self.end_cursor))
|
printl('end_cursor = ' + str(self.end_cursor))
|
||||||
@@ -584,7 +623,7 @@ class ListUser:
|
|||||||
printl(headers)
|
printl(headers)
|
||||||
|
|
||||||
def log_load_more_after(self):
|
def log_load_more_after(self):
|
||||||
if is_debuging:
|
if is_debug:
|
||||||
printl("<ListUser response>")
|
printl("<ListUser response>")
|
||||||
printl('self.__r.cookies=', end='')
|
printl('self.__r.cookies=', end='')
|
||||||
printl(self.__r.cookies)
|
printl(self.__r.cookies)
|
||||||
@@ -630,18 +669,17 @@ class InstaContent:
|
|||||||
return self.reply
|
return self.reply
|
||||||
|
|
||||||
def load_reply_more(self):
    """Fetch the next page of replies for this post and return them.

    Requests ``self.__referer`` with ``?max_id=<start_cursor>`` appended,
    then refreshes ``self.__code``, ``self.body``, ``self.reply``,
    ``self.start_cursor``, ``self.has_previous`` and the stored cookies
    from the response.

    Returns:
        The updated ``self.reply``.

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status()``).
    """
    # The earlier implementation POSTed form data to insta_query and parsed
    # the reply with instaparser.parse_reply_ajax; it was replaced by
    # re-fetching the post's HTML page with a max_id cursor.
    url = self.__referer + "?max_id=" + self.start_cursor
    self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies),
                            proxies=self.proxies, timeout=requests_timeout, stream=True)
    try:
        # requests_get presumably reads the streamed body - confirm
        # against its definition.
        content = requests_get(self.__r)
        self.__r.raise_for_status()
        self.__code = self.__get_code(url)
        (self.body, self.reply,
         self.start_cursor, self.has_previous) = instaparser.parse_body_html(content)
        self.__set_cookies(self.__r.cookies)
    finally:
        # With stream=True the connection stays checked out until the
        # response is closed, so release it even when a step above raises.
        self.__r.close()
    return self.reply
|
||||||
|
|
||||||
def get_cookies(self):
|
def get_cookies(self):
|
||||||
@@ -662,7 +700,7 @@ class InstaContent:
|
|||||||
return self.proxies
|
return self.proxies
|
||||||
|
|
||||||
def log_load_reply_more_before(self, form_data, headers):
|
def log_load_reply_more_before(self, form_data, headers):
|
||||||
if is_debuging:
|
if is_debug:
|
||||||
printl("<ContentReply Start>")
|
printl("<ContentReply Start>")
|
||||||
printl("<ContentReply requests>")
|
printl("<ContentReply requests>")
|
||||||
printl('start_cursor = ' + self.start_cursor)
|
printl('start_cursor = ' + self.start_cursor)
|
||||||
@@ -671,7 +709,7 @@ class InstaContent:
|
|||||||
printl(headers)
|
printl(headers)
|
||||||
|
|
||||||
def log_load_reply_more_after(self):
|
def log_load_reply_more_after(self):
|
||||||
if is_debuging:
|
if is_debug:
|
||||||
printl("<ContentReply response>")
|
printl("<ContentReply response>")
|
||||||
printl('self.__r.cookies=', end='')
|
printl('self.__r.cookies=', end='')
|
||||||
printl(self.__r.cookies)
|
printl(self.__r.cookies)
|
||||||
@@ -722,7 +760,7 @@ class InstaAlgorithm:
|
|||||||
self.close()
|
self.close()
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
if self.driver and not is_debuging:
|
if self.driver and not is_debug:
|
||||||
self.driver.quit()
|
self.driver.quit()
|
||||||
self.send_to_db.close()
|
self.send_to_db.close()
|
||||||
printl("Finished Crawling :)")
|
printl("Finished Crawling :)")
|
||||||
@@ -760,6 +798,7 @@ class InstaAlgorithm:
|
|||||||
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
|
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
|
||||||
# wait(1.5)
|
# wait(1.5)
|
||||||
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
|
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
|
||||||
|
printl("element insert to queue {}".format(element['url']))
|
||||||
self.list_crawl.append(element)
|
self.list_crawl.append(element)
|
||||||
backup_set.add(element['url'])
|
backup_set.add(element['url'])
|
||||||
if self.is_until_page():
|
if self.is_until_page():
|
||||||
@@ -897,6 +936,7 @@ class InstaAlgorithmMulti(InstaAlgorithm):
|
|||||||
# p_list = [multiprocessing.Process(target=crawl_content_process,
|
# p_list = [multiprocessing.Process(target=crawl_content_process,
|
||||||
# args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
|
# args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
|
||||||
# for i in range(num_of_content_process)]
|
# for i in range(num_of_content_process)]
|
||||||
|
printl("{} processs start".format(num_of_content_process))
|
||||||
p_list = [threading.Thread(target=crawl_content_process,
|
p_list = [threading.Thread(target=crawl_content_process,
|
||||||
args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
|
args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
|
||||||
for i in range(num_of_content_process)]
|
for i in range(num_of_content_process)]
|
||||||
|
|||||||
Reference in New Issue
Block a user