instagram 멀티로 실행하게 만들기
git-svn-id: svn://192.168.0.12/source@287 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -10,6 +10,9 @@ import os
|
||||
import psutil
|
||||
import threading
|
||||
import re
|
||||
import pymysql
|
||||
import random
|
||||
|
||||
from time import localtime, strftime
|
||||
|
||||
from selenium import webdriver
|
||||
@@ -19,6 +22,7 @@ from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
||||
|
||||
|
||||
def print_and_flush(string):
|
||||
print(string)
|
||||
sys.stdout.flush()
|
||||
@@ -77,7 +81,6 @@ def find_elements_by_xpath(driver, tag, time=0):
|
||||
)
|
||||
return elements
|
||||
|
||||
|
||||
class Browser:
|
||||
def __init__(self, driver=None):
|
||||
self.driver = driver
|
||||
@@ -120,7 +123,8 @@ class Browser:
|
||||
self.chrome_basename = os.path.basename(driver_exec)
|
||||
if self.is_server_executed(self.chrome_basename):
|
||||
port = self.port(self.chrome_basename)
|
||||
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.CHROME)
|
||||
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port,
|
||||
desired_capabilities=webdriver.DesiredCapabilities.CHROME)
|
||||
else:
|
||||
self.driver = webdriver.Chrome(self.chrome_driver_path)
|
||||
return self.driver
|
||||
@@ -151,9 +155,11 @@ class Browser:
|
||||
self.opera_basename = os.path.basename(driver_exec)
|
||||
if self.is_server_executed(self.opera_basename):
|
||||
port = self.port(self.opera_basename)
|
||||
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.OPERA)
|
||||
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port,
|
||||
desired_capabilities=webdriver.DesiredCapabilities.OPERA)
|
||||
else:
|
||||
self.driver = webdriver.Opera(desired_capabilities=webdriver.DesiredCapabilities.OPERA, executable_path=self.opera_driver_path)
|
||||
self.driver = webdriver.Opera(desired_capabilities=webdriver.DesiredCapabilities.OPERA,
|
||||
executable_path=self.opera_driver_path)
|
||||
return self.driver
|
||||
|
||||
def driver(self):
|
||||
@@ -244,9 +250,9 @@ class SendtoDB:
|
||||
if e.args[0] == 2013 or e.args[0] == 2006 or e.args[0] == 10054: # Lost connection to server
|
||||
print("connection lost. try to reconnection")
|
||||
self.conn = self.pymysql.connect(host='bigbird.iptime.org',
|
||||
user='admin', passwd='admin123',
|
||||
db='concepters', charset='utf8',
|
||||
cursorclass=self.pymysql.cursors.DictCursor)
|
||||
user='admin', passwd='admin123',
|
||||
db='concepters', charset='utf8',
|
||||
cursorclass=self.pymysql.cursors.DictCursor)
|
||||
with self.conn.cursor() as cursor:
|
||||
cursor.execute(query)
|
||||
self.conn.commit()
|
||||
@@ -319,7 +325,7 @@ class CrawlInit:
|
||||
pymysql = __import__('pymysql.cursors')
|
||||
|
||||
def __init__(self, before_day=0):
|
||||
self.conn = self.pymysql.connect(host ='bigbird.iptime.org',
|
||||
self.conn = self.pymysql.connect(host='bigbird.iptime.org',
|
||||
user='admin', passwd='admin123',
|
||||
db='concepters', charset='utf8',
|
||||
cursorclass=self.pymysql.cursors.DictCursor)
|
||||
|
||||
147
WebBasedCrawler/base/proxy.py
Normal file
147
WebBasedCrawler/base/proxy.py
Normal file
@@ -0,0 +1,147 @@
|
||||
import re
|
||||
import random
|
||||
import pymysql
|
||||
import os
|
||||
from selenium import webdriver
|
||||
import sys
|
||||
|
||||
proxy_filename = 'proxy.txt'
|
||||
re_ip = re.compile('([\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})[^\d]([\d]{2,5})')
|
||||
random.seed()
|
||||
|
||||
linux_driver_path = {
|
||||
'chrome': 'chromedriver',
|
||||
'opera': 'operadriver',
|
||||
'firefox': None,
|
||||
'ie': None
|
||||
}
|
||||
|
||||
window_driver_path = {
|
||||
'firefox': None,
|
||||
'chrome': 'chromedriver.exe',
|
||||
'ie': 'IEDriverServer.exe',
|
||||
'opera': 'operadriver.exe'
|
||||
}
|
||||
|
||||
driver_path = window_driver_path if sys.platform == 'win32' else linux_driver_path
|
||||
|
||||
platform_desired_capabilities = {
|
||||
'firefox': webdriver.DesiredCapabilities.FIREFOX,
|
||||
'ie': webdriver.DesiredCapabilities.INTERNETEXPLORER,
|
||||
'opera': webdriver.DesiredCapabilities.OPERA,
|
||||
'chrome': webdriver.DesiredCapabilities.CHROME
|
||||
}
|
||||
|
||||
platform_webdriver = {
|
||||
'firefox': webdriver.Firefox,
|
||||
'chrome': webdriver.Chrome,
|
||||
'ie': webdriver.Ie,
|
||||
'opera': webdriver.Opera
|
||||
}
|
||||
|
||||
|
||||
# pl_webdriver = {
|
||||
# 'firefox': {
|
||||
# 'path': None,
|
||||
# 'desired_capabilities': webdriver.DesiredCapabilities.FIREFOX,
|
||||
# 'webdriver': webdriver.Firefox
|
||||
# },
|
||||
# 'chrome': {
|
||||
# 'path': 'chromedriver.exe' if sys.platform == 'win32' else 'chromedriver',
|
||||
# 'desired_capabilities': webdriver.DesiredCapabilities.CHROME,
|
||||
# 'webdriver': webdriver.Chrome
|
||||
# },
|
||||
# 'ie': {
|
||||
# 'path': 'IEDriverServer.exe' if sys.platform == 'win32' else None,
|
||||
# 'desired_capabilities': webdriver.DesiredCapabilities.INTERNETEXPLORER,
|
||||
# 'webdriver': webdriver.Ie
|
||||
# },
|
||||
# 'opera': {
|
||||
# 'path': 'operadriver.exe' if sys.platform == 'win32' else 'operadriver',
|
||||
# 'desired_capabilities': webdriver.DesiredCapabilities.OPERA,
|
||||
# 'webdriver': webdriver.Opera
|
||||
# }
|
||||
# }
|
||||
|
||||
|
||||
def get_driver(platform, proxies):
|
||||
"""
|
||||
|
||||
:param platform: 'chrome', 'ie', 'opera', 'firefox'
|
||||
:param proxies: format : ip:port ex) '192.168.0.1:9999'
|
||||
:return: driver applied proxy
|
||||
"""
|
||||
# copy desired_capabilities
|
||||
desired_capabilities = platform_desired_capabilities[platform].copy()
|
||||
|
||||
# set proxy
|
||||
desired_capabilities['proxy'] = {
|
||||
'httpProxy': proxies,
|
||||
'ftpProxy': proxies,
|
||||
'sslProxy': proxies,
|
||||
'noProxy': None,
|
||||
'proxyType': 'MANUAL',
|
||||
# 'autodetect': False
|
||||
# 'autodetect': True
|
||||
}
|
||||
|
||||
# return driver applied proxy
|
||||
if platform == 'ie':
|
||||
return platform_webdriver[platform](executable_path=driver_path[platform],
|
||||
capabilities=desired_capabilities)
|
||||
|
||||
if driver_path[platform]:
|
||||
return platform_webdriver[platform](executable_path=driver_path[platform],
|
||||
desired_capabilities=desired_capabilities)
|
||||
# for firefox
|
||||
else:
|
||||
return platform_webdriver[platform](capabilities=desired_capabilities)
|
||||
|
||||
|
||||
def get_proxy_from_file(filename):
|
||||
"""
|
||||
:param filename:
|
||||
:return (ip, port): string, string
|
||||
if ip, port or filename is invalid, return (None, None)
|
||||
"""
|
||||
proxy_lists = [line.replace('\n', '') for line in open(filename) if re_ip.search(line)]
|
||||
if proxy_lists:
|
||||
m = re_ip.search(proxy_lists[random.randint(0, len(proxy_lists) - 1)])
|
||||
if m:
|
||||
return m.group(1), m.group(2)
|
||||
return (None, None)
|
||||
|
||||
|
||||
def get_proxy_from_db():
|
||||
try:
|
||||
conn = pymysql.connect(host='bigbird.iptime.org',
|
||||
user='admin', passwd='admin123',
|
||||
db='concepters', charset='utf8',
|
||||
cursorclass=pymysql.cursors.DictCursor)
|
||||
with conn.cursor() as cursor:
|
||||
cursor.execute("select * from Proxy")
|
||||
proxy_lists = [(str(i['Proxy']), str(i['Port'])) for i in cursor if i['Port']]
|
||||
conn.close()
|
||||
return proxy_lists[random.randint(0, len(proxy_lists) - 1)] if proxy_lists else (None, None)
|
||||
except:
|
||||
conn.close()
|
||||
return (None, None)
|
||||
|
||||
|
||||
def get_proxy():
|
||||
if os.path.exists(proxy_filename) and os.path.isfile(proxy_filename):
|
||||
return get_proxy_from_file(proxy_filename)
|
||||
else:
|
||||
return get_proxy_from_db()
|
||||
|
||||
|
||||
def get_requests_proxy(proxies):
|
||||
return {'http': 'http://' + proxies, 'https': 'http://' + proxies}
|
||||
|
||||
|
||||
def get_proxy_for_requests():
|
||||
ip, port = get_proxy()
|
||||
return get_requests_proxy(ip + ":" + port)
|
||||
|
||||
|
||||
|
||||
@@ -10,7 +10,10 @@ import insta.instaparser as instaparser
|
||||
import insta.instaheaders as instaheaders
|
||||
import requests
|
||||
import logging
|
||||
|
||||
# from multiprocessing import Queue
|
||||
# import multiprocessing
|
||||
from queue import Queue
|
||||
import threading
|
||||
|
||||
from base.baseclasses import SendtoDB
|
||||
from base.baseclasses import CrawlInit
|
||||
@@ -18,6 +21,7 @@ from base.baseclasses import wait
|
||||
from base.baseclasses import Browser
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from base.baseclasses import enter_element
|
||||
import base.proxy
|
||||
|
||||
|
||||
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
|
||||
@@ -29,6 +33,13 @@ insta_query = "https://www.instagram.com/query/"
|
||||
insta_body_url = 'https://www.instagram.com/p/'
|
||||
|
||||
is_debuging = False
|
||||
is_debug = False
|
||||
|
||||
|
||||
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
|
||||
if is_debug:
|
||||
print(*objects, sep=sep, end=end, file=file, flush=flush)
|
||||
|
||||
|
||||
num_of_list_ajax = 24
|
||||
num_of_reply_ajax = 100
|
||||
@@ -36,7 +47,9 @@ list_wait_sec = 0.9
|
||||
body_wait_sec = 0.5
|
||||
reply_wait_sec = 0.8
|
||||
num_of_page_down = 20
|
||||
|
||||
num_of_content_process = 10
|
||||
requests_timeout = 60
|
||||
num_of_retry_proxy = 5
|
||||
|
||||
logging.basicConfig(level=logging.INFO,
|
||||
format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
|
||||
@@ -62,6 +75,240 @@ def focus_driver(driver):
|
||||
driver.set_window_position(position['x'], position['y'])
|
||||
|
||||
|
||||
def instance_wrapper(func):
|
||||
# to save nice ip, port of proxy
|
||||
ip, port = base.proxy.get_proxy()
|
||||
|
||||
def retry_load(*args, **kwargs):
|
||||
while True:
|
||||
# use clouser
|
||||
nonlocal ip, port
|
||||
proxies = base.proxy.get_requests_proxy(ip + ":" + port)
|
||||
kwargs['proxies'] = proxies
|
||||
# retry = num_of_retry_proxy
|
||||
# while retry:
|
||||
res = func(*args, **kwargs)
|
||||
if res:
|
||||
# printl("id : {2} - connect success - {0}:{1}".format(ip, port, threading.get_ident()))
|
||||
return res
|
||||
# if the proxy was not good, get new proxy
|
||||
# printl('id : {2} - connect failed - {0}:{1}'.format(ip, port, threading.get_ident()))
|
||||
ip, port = base.proxy.get_proxy()
|
||||
# retry -= 1
|
||||
return retry_load
|
||||
|
||||
|
||||
class InstanceWrapper(object):
|
||||
def __init__(self, func):
|
||||
self.ip, self.port = base.proxy.get_proxy()
|
||||
self.func = func
|
||||
self.num_of_retry_proxy = num_of_retry_proxy
|
||||
|
||||
def do(self, *args, **kwargs):
|
||||
while True:
|
||||
proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
|
||||
kwargs['proxies'] = proxies
|
||||
# retry = num_of_retry_proxy
|
||||
# while retry:
|
||||
res = self.func(*args, **kwargs)
|
||||
if res:
|
||||
# printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
|
||||
return res
|
||||
# if the proxy was not good, get new proxy
|
||||
# printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
|
||||
self.ip, self.port = base.proxy.get_proxy()
|
||||
# retry -= 1
|
||||
|
||||
def do_retry(self, *args, **kwargs):
|
||||
while True:
|
||||
proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
|
||||
kwargs['proxies'] = proxies
|
||||
retry = self.num_of_retry_proxy
|
||||
while retry:
|
||||
res = self.func(*args, **kwargs)
|
||||
if res:
|
||||
# printl("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
|
||||
return res
|
||||
# if the proxy was not good, get new proxy
|
||||
# printl('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
|
||||
retry -= 1
|
||||
self.ip, self.port = base.proxy.get_proxy()
|
||||
|
||||
def do_no_proxy(self, *args, **kwargs):
|
||||
while True:
|
||||
retry = self.num_of_retry_proxy
|
||||
while retry:
|
||||
proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
|
||||
kwargs['proxies'] = proxies
|
||||
res = self.func(*args, **kwargs)
|
||||
if res:
|
||||
printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
|
||||
return res
|
||||
# if the proxy was not good, get new proxy
|
||||
printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
|
||||
retry -= 1
|
||||
self.ip, self.port = base.proxy.get_proxy()
|
||||
|
||||
# if get content with proxy failed, set no proxy
|
||||
# func guarantee returning a instance except the case where a url is invalid
|
||||
kwargs['proxies'] = None
|
||||
res = self.func(*args, **kwargs)
|
||||
# if res:
|
||||
# printl("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident()))
|
||||
# printl(args, kwargs)
|
||||
printd("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident()))
|
||||
return res
|
||||
|
||||
def change_proxy(self):
|
||||
self.ip, self.port = base.proxy.get_proxy()
|
||||
|
||||
|
||||
@instance_wrapper
|
||||
def make_list_instance(url, proxies=None):
|
||||
try:
|
||||
if insta_tag_url in url:
|
||||
list_crawler = ListTag(url, proxies)
|
||||
else:
|
||||
list_crawler = ListUser(url, proxies)
|
||||
return list_crawler
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
# @instance_wrapper
|
||||
def make_content_instance(url, proxies=None):
|
||||
try:
|
||||
content = InstaContent(url, {}, url, proxies)
|
||||
return content
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
def ajax_wrapper(func):
|
||||
def retry_ajax_load(*args, **kwargs):
|
||||
retry = num_of_retry_proxy
|
||||
while retry:
|
||||
res = func(*args, **kwargs)
|
||||
if res is not None:
|
||||
break
|
||||
retry -= 1
|
||||
return res
|
||||
return retry_ajax_load
|
||||
|
||||
|
||||
@ajax_wrapper
|
||||
def load_ajax_list(ins):
|
||||
try:
|
||||
insta_list = ins.load_more()
|
||||
# if insta_list:
|
||||
# return insta_list
|
||||
# else:
|
||||
# return None
|
||||
return insta_list
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
@ajax_wrapper
|
||||
def load_ajax_reply(ins):
|
||||
try:
|
||||
replies = ins.load_reply_more()
|
||||
# if replies:
|
||||
# return replies
|
||||
# else:
|
||||
# return None
|
||||
return replies
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
# def crawl_content_process(qu, keyword_id, db_num):
|
||||
# send_to_db = SendtoDB()
|
||||
# send_to_db.set_db(db_num)
|
||||
# while True:
|
||||
# element = qu.get()
|
||||
# if element is None:
|
||||
# break
|
||||
# ok = True
|
||||
# while ok:
|
||||
# try:
|
||||
# ip, port = base.proxy.get_proxy()
|
||||
# proxies = base.proxy.get_requests_proxy(ip + ":" + port)
|
||||
# content = InstaContent(element['url'], {}, element['url'], proxies)
|
||||
# body = content.get_body()
|
||||
# replies = content.get_reply()
|
||||
# body['article_url'] = element['url']
|
||||
# body['keyword_id'] = keyword_id
|
||||
# while content.has_previous:
|
||||
# replies = content.load_reply_more() + replies
|
||||
# wait(reply_wait_sec)
|
||||
# for j in range(0, len(replies)):
|
||||
# replies[j]['article_url'] = body['article_url']
|
||||
# replies[j]['platform_id'] = body['platform_id']
|
||||
# replies[j]['article_order'] = j
|
||||
# send_to_db.delete_url(body['article_url'])
|
||||
# send_to_db.send_body(body)
|
||||
# if replies:
|
||||
# send_to_db.send_reply(replies)
|
||||
# printl(element['url'])
|
||||
# printl('ok')
|
||||
# ok = False
|
||||
# except:
|
||||
# printl("failed proxy {0}:{1}".format(ip, port))
|
||||
# printl('finish thread')
|
||||
|
||||
|
||||
def crawl_content_process(qu, keyword_id, db_num):
|
||||
# m_c_i = instance_wrapper(make_content_instance)
|
||||
m_c_i = InstanceWrapper(make_content_instance)
|
||||
send_to_db = SendtoDB()
|
||||
send_to_db.set_db(db_num)
|
||||
while True:
|
||||
element = qu.get()
|
||||
if element is None:
|
||||
break
|
||||
ok = True
|
||||
while ok:
|
||||
try:
|
||||
# get a instance of InstaContent by do_no_proxy func.
|
||||
# if element['url'] is invalid, content is None
|
||||
content = m_c_i.do_no_proxy(element['url'])
|
||||
if not content:
|
||||
break
|
||||
body = content.get_body()
|
||||
replies = content.get_reply()
|
||||
body['article_url'] = element['url']
|
||||
body['keyword_id'] = keyword_id
|
||||
while content.has_previous:
|
||||
rep = load_ajax_reply(content)
|
||||
if rep is None:
|
||||
printl("proxies = ", content.proxies)
|
||||
m_c_i.change_proxy()
|
||||
raise Exception("reply load error")
|
||||
replies = rep + replies
|
||||
wait(reply_wait_sec)
|
||||
for j in range(0, len(replies)):
|
||||
replies[j]['article_url'] = body['article_url']
|
||||
replies[j]['platform_id'] = body['platform_id']
|
||||
replies[j]['article_order'] = j
|
||||
send_to_db.delete_url(body['article_url'])
|
||||
send_to_db.send_body(body)
|
||||
if replies:
|
||||
send_to_db.send_reply(replies)
|
||||
printl(element['url'])
|
||||
printl('ok')
|
||||
ok = False
|
||||
except UnicodeEncodeError as ue:
|
||||
printl(element['url'])
|
||||
printl(ue)
|
||||
break
|
||||
except Exception as e:
|
||||
# catch error when send_to_db error occur
|
||||
printl(element['url'])
|
||||
printl(e)
|
||||
printl('finish thread')
|
||||
|
||||
|
||||
class InstaInit(CrawlInit):
|
||||
def __init__(self, before_day=0):
|
||||
super().__init__(before_day)
|
||||
@@ -107,7 +354,7 @@ class InstaInit(CrawlInit):
|
||||
|
||||
|
||||
class ListTag:
|
||||
def __init__(self, url):
|
||||
def __init__(self, url, proxies=None):
|
||||
self.__r = None
|
||||
self.__tag = ''
|
||||
self.__url = ''
|
||||
@@ -115,54 +362,31 @@ class ListTag:
|
||||
self.end_cursor = None
|
||||
self.has_next = False
|
||||
self.cookies = {}
|
||||
self.load_url(url)
|
||||
|
||||
def load_url(self, url):
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
|
||||
if is_debuging:
|
||||
printl("<ListTag Start>")
|
||||
printl("<ListTag requests>")
|
||||
printl('headers = ', end=' ')
|
||||
printl(instaheaders.get_headers_for_list_html())
|
||||
self.proxies = proxies
|
||||
self.load_url(url, self.proxies)
|
||||
|
||||
def load_url(self, url, proxies):
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
|
||||
timeout=requests_timeout)
|
||||
self.log_load_url_before()
|
||||
self.__r.raise_for_status()
|
||||
self.__tag = self.__get_tag(url)
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
self.__url = url
|
||||
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
|
||||
if is_debuging:
|
||||
printl("<ListTag response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
printl('has_next = ', end='')
|
||||
printl(self.has_next)
|
||||
printl("<ListTag End>")
|
||||
|
||||
self.log_load_url_after()
|
||||
return self.list_tag
|
||||
|
||||
def load_more(self):
|
||||
form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
|
||||
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
|
||||
if is_debuging:
|
||||
printl("<ListTag Start>")
|
||||
printl("<ListTag requests>")
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
printl('form_data' + form_data)
|
||||
printl('headers = ', end=' ')
|
||||
printl(headers)
|
||||
self.__r = requests.post(insta_query, headers=headers, data=form_data)
|
||||
self.log_load_more_before(form_data, headers)
|
||||
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
|
||||
timeout=requests_timeout)
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
self.__r.raise_for_status()
|
||||
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
|
||||
if is_debuging:
|
||||
printl("<ListTag response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
printl('has_next = ', end='')
|
||||
printl(self.has_next)
|
||||
printl("<ListTag End>")
|
||||
self.log_load_more_after()
|
||||
return self.list_tag
|
||||
|
||||
def __get_tag(self, url):
|
||||
@@ -191,9 +415,52 @@ class ListTag:
|
||||
def get_list(self):
|
||||
return self.list_tag
|
||||
|
||||
def get_proxy(self):
|
||||
return self.proxies
|
||||
|
||||
def log_load_url_before(self):
|
||||
if is_debuging:
|
||||
printl("<ListTag Start>")
|
||||
printl("<ListTag requests>")
|
||||
printl('headers = ', end=' ')
|
||||
printl(instaheaders.get_headers_for_list_html())
|
||||
|
||||
def log_load_url_after(self):
|
||||
if is_debuging:
|
||||
printl("<ListTag response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
printl('has_next = ', end='')
|
||||
printl(self.has_next)
|
||||
printl('proxies = ', end='')
|
||||
printl(self.proxies)
|
||||
printl("<ListTag End>")
|
||||
|
||||
def log_load_more_before(self, form_data, headers):
|
||||
if is_debuging:
|
||||
printl("<ListTag Start>")
|
||||
printl("<ListTag requests>")
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
printl('form_data' + form_data)
|
||||
printl('headers = ', end=' ')
|
||||
printl(headers)
|
||||
|
||||
def log_load_more_after(self):
|
||||
if is_debuging:
|
||||
printl("<ListTag response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
printl('has_next = ', end='')
|
||||
printl(self.has_next)
|
||||
printl('proxies = ', end='')
|
||||
printl(self.proxies)
|
||||
printl("<ListTag End>")
|
||||
|
||||
|
||||
class ListUser:
|
||||
def __init__(self, url):
|
||||
def __init__(self, url, proxies=None):
|
||||
self.__r = None
|
||||
self.__user = ''
|
||||
self.__url = ''
|
||||
@@ -201,10 +468,12 @@ class ListUser:
|
||||
self.end_cursor = None
|
||||
self.has_next = False
|
||||
self.cookies = {}
|
||||
self.load_url(url)
|
||||
self.proxies = proxies
|
||||
self.load_url(url, self.proxies)
|
||||
|
||||
def load_url(self, url):
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
|
||||
def load_url(self, url, proxies):
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
|
||||
timeout=requests_timeout)
|
||||
self.__r.raise_for_status()
|
||||
self.__url = url
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
@@ -214,26 +483,14 @@ class ListUser:
|
||||
def load_more(self):
|
||||
form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
|
||||
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
|
||||
if is_debuging:
|
||||
printl("<ListUser Start>")
|
||||
printl("<ListUser requests>")
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
printl('form_data' + form_data)
|
||||
printl('headers = ', end=' ')
|
||||
printl(headers)
|
||||
self.__r = requests.post(insta_query, headers=headers, data=form_data)
|
||||
self.log_load_more_before(form_data, headers)
|
||||
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
|
||||
timeout=requests_timeout)
|
||||
self.__r.raise_for_status()
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
if is_debuging:
|
||||
printl("<ListUser response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
printl('has_next = ', end='')
|
||||
printl(self.has_next)
|
||||
printl("<ListUser End>")
|
||||
|
||||
self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
|
||||
self.log_load_more_after()
|
||||
return self.list_user
|
||||
|
||||
def get_cookies(self):
|
||||
@@ -255,9 +512,33 @@ class ListUser:
|
||||
def get_list(self):
|
||||
return self.list_user
|
||||
|
||||
def get_proxy(self):
|
||||
return self.proxies
|
||||
|
||||
def log_load_more_before(self, form_data, headers):
|
||||
if is_debuging:
|
||||
printl("<ListUser Start>")
|
||||
printl("<ListUser requests>")
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
printl('form_data' + form_data)
|
||||
printl('headers = ', end=' ')
|
||||
printl(headers)
|
||||
|
||||
def log_load_more_after(self):
|
||||
if is_debuging:
|
||||
printl("<ListUser response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
printl('end_cursor = ' + str(self.end_cursor))
|
||||
printl('has_next = ', end='')
|
||||
printl(self.has_next)
|
||||
printl('proxies = ', end='')
|
||||
printl(self.proxies)
|
||||
printl("<ListUser End>")
|
||||
|
||||
|
||||
class InstaContent:
|
||||
def __init__(self, url, cookies, referer):
|
||||
def __init__(self, url, cookies, referer, proxies=None):
|
||||
self.__r = None
|
||||
self.__referer = ''
|
||||
self.__code = ''
|
||||
@@ -266,11 +547,13 @@ class InstaContent:
|
||||
self.start_cursor = None
|
||||
self.has_previous = False
|
||||
self.cookies = {}
|
||||
self.load_url(url, cookies, referer)
|
||||
self.proxies = proxies
|
||||
self.load_url(url, cookies, referer, self.proxies)
|
||||
|
||||
def load_url(self, url, cookies, referer):
|
||||
def load_url(self, url, cookies, referer, proxies):
|
||||
self.__set_cookies(cookies)
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
|
||||
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
|
||||
timeout=requests_timeout)
|
||||
self.__r.raise_for_status()
|
||||
self.__referer = referer
|
||||
self.__code = self.__get_code(url)
|
||||
@@ -287,25 +570,13 @@ class InstaContent:
|
||||
def load_reply_more(self):
|
||||
form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
|
||||
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
|
||||
if is_debuging:
|
||||
printl("<ContentReply Start>")
|
||||
printl("<ContentReply requests>")
|
||||
printl('start_cursor = ' + self.start_cursor)
|
||||
printl('form_data' + form_data)
|
||||
printl('headers = ', end=' ')
|
||||
printl(headers)
|
||||
self.__r = requests.post(insta_query, headers=headers, data=form_data)
|
||||
self.log_load_reply_more_before(form_data, headers)
|
||||
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
|
||||
timeout=requests_timeout)
|
||||
self.__r.raise_for_status()
|
||||
self.__set_cookies(self.__r.cookies)
|
||||
self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
|
||||
if is_debuging:
|
||||
printl("<ContentReply response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
printl('start_cursor = ' + str(self.start_cursor))
|
||||
printl('has_previous = ', end='')
|
||||
printl(self.has_previous)
|
||||
printl("<ContentReply End>")
|
||||
self.log_load_reply_more_after()
|
||||
return self.reply
|
||||
|
||||
def get_cookies(self):
|
||||
@@ -322,6 +593,30 @@ class InstaContent:
|
||||
for k, v in cookies.items():
|
||||
self.cookies[k] = v
|
||||
|
||||
def get_proxy(self):
|
||||
return self.proxies
|
||||
|
||||
def log_load_reply_more_before(self, form_data, headers):
|
||||
if is_debuging:
|
||||
printl("<ContentReply Start>")
|
||||
printl("<ContentReply requests>")
|
||||
printl('start_cursor = ' + self.start_cursor)
|
||||
printl('form_data' + form_data)
|
||||
printl('headers = ', end=' ')
|
||||
printl(headers)
|
||||
|
||||
def log_load_reply_more_after(self):
|
||||
if is_debuging:
|
||||
printl("<ContentReply response>")
|
||||
printl('self.__r.cookies=', end='')
|
||||
printl(self.__r.cookies)
|
||||
printl('start_cursor = ' + str(self.start_cursor))
|
||||
printl('has_previous = ', end='')
|
||||
printl(self.has_previous)
|
||||
printl('proxies = ', end='')
|
||||
printl(self.proxies)
|
||||
printl("<ContentReply End>")
|
||||
|
||||
|
||||
class InstaAlgorithm:
|
||||
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
|
||||
@@ -474,6 +769,123 @@ class InstaAlgorithmNormal(InstaAlgorithm):
|
||||
printl("Finished Crawling :)")
|
||||
|
||||
|
||||
class InstaAlgorithmMulti(InstaAlgorithm):
|
||||
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
|
||||
reload_wait_second=2, num_of_load_content=12, page_down=50):
|
||||
super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
|
||||
reload_wait_second, num_of_load_content, page_down)
|
||||
if self.driver:
|
||||
self.driver.quit()
|
||||
self.list_crawl = Queue()
|
||||
self.total_num = 0
|
||||
|
||||
def crawl_contents(self, contents_list, backup_set):
|
||||
"""
|
||||
:param contents_list:
|
||||
:param backup_set:
|
||||
:return: is_load_more
|
||||
"""
|
||||
old_elements = 0
|
||||
for element in contents_list:
|
||||
if element['date'].date() > self.crawl_init.get_end_day():
|
||||
# printl(element['url'])
|
||||
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
|
||||
|
||||
elif element['date'].date() < self.crawl_init.get_begin_day():
|
||||
printl(element['url'])
|
||||
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
|
||||
old_elements += 1
|
||||
if old_elements > 6:
|
||||
return False
|
||||
else:
|
||||
if not element['url'] in backup_set:
|
||||
# printl(element['url'])
|
||||
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
|
||||
# wait(1.5)
|
||||
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
|
||||
self.list_crawl.put(element)
|
||||
backup_set.add(element['url'])
|
||||
self.total_num += 1
|
||||
if self.is_until_page():
|
||||
return False
|
||||
# if self.list_crawl:
|
||||
# printl("Number of Lists = {0}".format(len(self.list_crawl)))
|
||||
return True
|
||||
|
||||
def crawl(self):
    """Crawl every configured URL, feeding found posts to worker threads.

    For each URL a pool of content-worker threads is started; the listing
    is paged through Instagram's ajax endpoint and each new post is queued
    on ``self.list_crawl`` (a queue consumed by ``crawl_content_process``).
    ``None`` sentinels stop the workers once a listing is exhausted.  In
    realtime mode the whole pass repeats indefinitely.
    """
    real_time = True
    while real_time:
        printl("Crawling Start")
        url_list = self.crawl_init.make_url()
        i = 0
        end_cursor = None
        backup_set = set()
        while i < len(url_list):
            # first connect
            try:
                printl(url_list[i] + "\n")
                # insta_content worker threads create and start
                p_list = [threading.Thread(target=crawl_content_process,
                                           args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
                          for _ in range(num_of_content_process)]
                for p in p_list:
                    p.daemon = True
                    p.start()

                # crawl list: retry until the first listing page loads
                ok = True
                while ok:
                    try:
                        list_crawler = make_list_instance(url_list[i])
                        ok = False
                    except Exception as e:
                        printl(e)
                        wait(1)
                insta_list = list_crawler.get_list()
                is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                # ajax load: keep paging while new in-range posts appear
                while is_load_more:
                    if end_cursor:
                        # resume from the cursor saved by a previous failure
                        list_crawler.end_cursor = end_cursor
                        end_cursor = None
                    wait(self.reload_wait_second)
                    try:
                        insta_list = load_ajax_list(list_crawler)
                        if insta_list is None:
                            break
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    except Exception as e:
                        printl('is_load_more exception')
                        printl(e)
                        is_load_more = False
                printl("end load")
                printl("total number of crawled list = {0}".format(self.total_num))
                self.total_num = 0

                # stop child process: one None sentinel per worker.
                # BUGFIX: the original wrote ``for i in range(...)`` here,
                # clobbering the URL index ``i`` of the enclosing loop so the
                # subsequent ``i += 1`` advanced from num_of_content_process-1
                # instead of the current URL position.
                for _ in range(num_of_content_process):
                    self.list_crawl.put(None)

                # wait child process
                for p in p_list:
                    p.join()

                i += 1
            except Exception as e:
                logging.info(e)
                # NOTE(review): list_crawler may be unbound if the failure
                # happened before the first page ever loaded -- confirm.
                end_cursor = list_crawler.end_cursor
                printl('end_cursor=' + end_cursor)
                if e.args:
                    wait(300)
        real_time = self.crawl_init.is_realtime()
    printl("Finished Crawling :)")
|
||||
|
||||
|
||||
class InstaAlgorithmBrowser(InstaAlgorithm):
|
||||
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
|
||||
reload_wait_second=2, num_of_load_content=12, page_down=50):
|
||||
@@ -548,7 +960,8 @@ class InstaMainCrawler:
|
||||
def __init__(self):
|
||||
self.send_to_db = SendtoDB()
|
||||
self.crawl_init = InstaInit()
|
||||
self.browser = Browser()
|
||||
# self.browser = Browser()
|
||||
self.browser = None
|
||||
self.driver = None
|
||||
|
||||
def set_keyword_id(self, keyword_id):
|
||||
@@ -565,7 +978,7 @@ class InstaMainCrawler:
|
||||
self.init_db(db_num)
|
||||
self.init_before_day(before_day)
|
||||
self.init_until_page(until_page)
|
||||
self.init_browser(browser)
|
||||
# self.init_browser(browser)
|
||||
|
||||
def set_driver(self, driver):
|
||||
self.driver = driver
|
||||
@@ -594,10 +1007,12 @@ class InstaMainCrawler:
|
||||
self.crawl_init.set_until_page(until_page)
|
||||
|
||||
def crawler_start(self):
    """Instantiate and run the crawl algorithm.

    The Browser/Normal variants are superseded by the multi-threaded
    algorithm, which handles both driver and driver-less operation; the
    original code still constructed one of them and then unconditionally
    overwrote it with InstaAlgorithmMulti, wasting a dead object.  Only
    the Multi algorithm is built here.
    """
    algorithm = InstaAlgorithmMulti(self.driver, self.browser, self.crawl_init, self.send_to_db,
                                    self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
    algorithm.start_crawl()
|
||||
|
||||
603
WebBasedCrawler/insta/instacrawl_backup3.py
Normal file
603
WebBasedCrawler/insta/instacrawl_backup3.py
Normal file
@@ -0,0 +1,603 @@
|
||||
#-*- coding: utf-8 -*-
|
||||
'''
|
||||
Created on 2015. 12. 8.
|
||||
|
||||
@author: cococo
|
||||
'''
|
||||
import re
|
||||
import datetime
|
||||
import insta.instaparser as instaparser
|
||||
import insta.instaheaders as instaheaders
|
||||
import requests
|
||||
import logging
|
||||
|
||||
|
||||
from base.baseclasses import SendtoDB
|
||||
from base.baseclasses import CrawlInit
|
||||
from base.baseclasses import wait
|
||||
from base.baseclasses import Browser
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from base.baseclasses import enter_element
|
||||
|
||||
|
||||
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """print() wrapper that flushes by default so output appears immediately."""
    options = {'sep': sep, 'end': end, 'file': file, 'flush': flush}
    print(*objects, **options)
|
||||
|
||||
# Instagram endpoints used by the crawler.
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"          # ajax pagination endpoint
insta_body_url = 'https://www.instagram.com/p/'           # single-post base URL

# When True, verbose request/response dumps are printed and the browser
# is left open on close().
is_debuging = False

# Tuning knobs: page sizes and inter-request delays (seconds).
num_of_list_ajax = 24      # posts requested per listing ajax page
num_of_reply_ajax = 100    # comments requested per reply ajax page
list_wait_sec = 0.9        # delay between listing pages
body_wait_sec = 0.5        # delay before each post download
reply_wait_sec = 0.8       # delay between reply pages
num_of_page_down = 20      # PAGE_DOWN presses between load-more clicks


logging.basicConfig(level=logging.INFO,
                    format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
# Silence noisy third-party loggers.
logging.getLogger('requests').setLevel(logging.WARNING)
logging.getLogger('pymysql').setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def click_insta_load_more(driver):
    """Activate Instagram's "Load more" anchor on the current page."""
    load_more_link = driver.find_element_by_css_selector("div._pupj3 > a")
    enter_element(load_more_link)
|
||||
|
||||
|
||||
def push_page_down(driver):
    """Send one PAGE_DOWN keypress to the page body to scroll the feed."""
    page_body = driver.find_element_by_tag_name('body')
    page_body.send_keys(Keys.PAGE_DOWN)
|
||||
|
||||
|
||||
def focus_driver(driver):
    """Bring the browser window to front without changing its geometry.

    Maximizing forces the window to take focus; the previously saved size
    and position are then restored so the layout is unchanged.
    """
    saved_position = driver.get_window_position()
    saved_size = driver.get_window_size()
    driver.maximize_window()
    driver.set_window_size(saved_size['width'], saved_size["height"])
    driver.set_window_position(saved_position['x'], saved_position['y'])
|
||||
|
||||
|
||||
class InstaInit(CrawlInit):
    """Instagram-specific crawl configuration.

    Maps the platform id stored with the keyword to a base URL:
    9 -> tag search pages, 10 -> user profile pages.
    """

    def __init__(self, before_day=0):
        super().__init__(before_day)
        self.urls = {9: insta_tag_url, 10: insta_url}

    def split_searches(self):
        """Return the comma-separated search string as a list of cleaned terms."""
        raw_terms = self.searches().split(',')
        if self.platform() == 10:
            # User names only need whitespace trimming.
            return [term.strip() for term in raw_terms]
        # Tag terms go through the project's utf8 helper (defined in CrawlInit).
        return [self.utf8(term) for term in raw_terms]

    def make_url(self):
        """Build one crawl URL per search term."""
        return [self.urls[self.platform()] + term for term in self.split_searches()]

    def get_begin_day(self):
        """Earliest date to keep: today's midnight shifted by before_day in
        realtime mode, otherwise the configured start day."""
        if not self.is_realtime():
            return self.start_day()
        now = datetime.datetime.now()
        midnight = datetime.datetime(year=now.year, month=now.month, day=now.day)
        return (midnight + datetime.timedelta(days=self.before_day)).date()

    def get_end_day(self):
        """Latest date to keep: today in realtime mode, otherwise the
        configured end day."""
        if not self.is_realtime():
            return self.end_day()
        now = datetime.datetime.now()
        return datetime.datetime(year=now.year, month=now.month, day=now.day).date()
|
||||
|
||||
|
||||
class ListTag:
    """Fetches an Instagram tag page and pages through it via the ajax
    query endpoint, accumulating session cookies between requests."""

    def __init__(self, url):
        self.__r = None              # last requests.Response
        self.__tag = ''              # tag name extracted from the URL
        self.__url = ''              # page URL, reused as ajax Referer
        self.list_tag = []           # most recently parsed page of posts
        self.end_cursor = None       # pagination cursor for load_more()
        self.has_next = False        # whether another page exists
        self.cookies = {}            # cookies merged across responses
        self.load_url(url)

    def load_url(self, url):
        """GET the tag page, parse its embedded post list, and prime the
        pagination cursor and cookies.  Returns the parsed post list."""
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        if is_debuging:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('headers = ', end=' ')
            printl(instaheaders.get_headers_for_list_html())

        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.__url = url
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
        if is_debuging:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListTag End>")

        return self.list_tag

    def load_more(self):
        """POST the ajax query for the next page (driven by end_cursor);
        advances end_cursor/has_next and returns the parsed posts."""
        form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        if is_debuging:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        # NOTE(review): cookies are merged *before* raise_for_status() here,
        # unlike ListUser.load_more() which raises first -- confirm intent.
        self.__set_cookies(self.__r.cookies)
        self.__r.raise_for_status()
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        if is_debuging:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListTag End>")
        return self.list_tag

    def __get_tag(self, url):
        # Extract the tag segment that follows the tag-search base URL.
        m = re.search(insta_tag_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        # Merge (never replace) so the session accumulates cookies.
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_tag
|
||||
|
||||
|
||||
class ListUser:
    """Fetches an Instagram user profile page and pages through the
    user's posts via the ajax query endpoint."""

    def __init__(self, url):
        self.__r = None              # last requests.Response
        self.__user = ''             # user identifier from the parsed page
        self.__url = ''              # page URL, reused as ajax Referer
        self.list_user = []          # most recently parsed page of posts
        self.end_cursor = None       # pagination cursor for load_more()
        self.has_next = False        # whether another page exists
        self.cookies = {}            # cookies merged across responses
        self.load_url(url)

    def load_url(self, url):
        """GET the profile page, parse the embedded post list, and prime
        the user id, cursor and cookies.  Returns the parsed post list."""
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__url = url
        self.__set_cookies(self.__r.cookies)
        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
        return self.list_user

    def load_more(self):
        """POST the ajax query for the next page of the user's posts;
        advances end_cursor/has_next and returns the parsed posts."""
        form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        if is_debuging:
            printl("<ListUser Start>")
            printl("<ListUser requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        if is_debuging:
            # NOTE: end_cursor/has_next are printed *before* the parse below
            # updates them, so these show the previous page's values.
            printl("<ListUser response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListUser End>")

        self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        return self.list_user

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        # Merge (never replace) so the session accumulates cookies.
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_user
|
||||
|
||||
|
||||
class InstaContent:
    """Fetches a single post page (body + comments) and pages backwards
    through older comments via the ajax query endpoint."""

    def __init__(self, url, cookies, referer):
        self.__r = None              # last requests.Response
        self.__referer = ''          # listing URL used as ajax Referer
        self.__code = ''             # post shortcode taken from the URL
        self.body = None             # parsed post body
        self.reply = []              # parsed comments (latest page)
        self.start_cursor = None     # cursor for older comments
        self.has_previous = False    # whether older comments exist
        self.cookies = {}            # cookies merged across responses
        self.load_url(url, cookies, referer)

    def load_url(self, url, cookies, referer):
        """GET the post page using the listing's cookies; parse the body,
        the visible comments and the backwards-pagination cursor."""
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        self.__set_cookies(self.__r.cookies)
        return self.body, self.reply

    def get_body(self):
        return self.body

    def get_reply(self):
        return self.reply

    def load_reply_more(self):
        """POST the ajax query for the previous (older) page of comments;
        advances start_cursor/has_previous and returns the parsed replies."""
        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        if is_debuging:
            printl("<ContentReply Start>")
            printl("<ContentReply requests>")
            # NOTE(review): this concatenation assumes start_cursor is a str
            # here (it is wrapped in str() elsewhere) -- confirm.
            printl('start_cursor = ' + self.start_cursor)
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
        if is_debuging:
            printl("<ContentReply response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('start_cursor = ' + str(self.start_cursor))
            printl('has_previous = ', end='')
            printl(self.has_previous)
            printl("<ContentReply End>")
        return self.reply

    def get_cookies(self):
        return self.cookies

    def __get_code(self, url):
        # Extract the shortcode that follows the post base URL.
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def __set_cookies(self, cookies):
        # Merge (never replace) so the session accumulates cookies.
        for k, v in cookies.items():
            self.cookies[k] = v
|
||||
|
||||
|
||||
class InstaAlgorithm:
    """Base crawl algorithm: shared state plus listing/content helpers.

    Subclasses implement crawl(); this class provides post download,
    date-window filtering and DB persistence.
    """

    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        self.send_to_db = send_to_db
        self.crawl_init = crawl_init
        self.browser = browser
        self.driver = driver
        self.keyword_id = keyword_id
        self.reload_wait_second = reload_wait_second    # delay between ajax pages
        self.num_of_load_content = num_of_load_content
        self.page_down = page_down                      # PAGE_DOWNs per load-more attempt
        self.list_crawl = []                            # posts gathered by crawl_contents()

    def crawl_content(self, url, cookies, referer):
        """Download one post (body plus all comments) and store it in the DB."""
        content = InstaContent(url, cookies, referer)
        body = content.get_body()
        replies = content.get_reply()
        body['article_url'] = url
        body['keyword_id'] = self.keyword_id
        # printl(body['article_url'])
        # Page backwards until every older comment has been fetched;
        # older pages are prepended so replies stay in display order.
        while content.has_previous:
            replies = content.load_reply_more() + replies
            wait(reply_wait_sec)
        for j in range(0, len(replies)):
            replies[j]['article_url'] = body['article_url']
            replies[j]['platform_id'] = body['platform_id']
            replies[j]['article_order'] = j
        # Replace any previously stored copy of this article.
        self.send_to_db.delete_url(body['article_url'])
        self.send_to_db.send_body(body)
        if replies:
            self.send_to_db.send_reply(replies)
        printl('ok')
        printl()

    def start_crawl(self):
        """Template method: run the subclass crawl, then clean up."""
        self.crawl()
        self.close()

    def close(self):
        # Keep the browser open while debugging.
        if self.driver and not is_debuging:
            self.driver.quit()
        self.send_to_db.close()
        printl("Finished Crawling :)")

    def crawl(self):
        # Subclass responsibility.
        raise NotImplementedError

    def is_until_page(self):
        """True once the configured page limit of posts has been collected."""
        if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl):
            return True
        else:
            return False

    def crawl_contents(self, contents_list, backup_set):
        """Filter one page of posts into self.list_crawl.

        :param contents_list: parsed posts, each with 'url' and 'date'
        :param backup_set: URLs already queued, used to skip duplicates
        :return: is_load_more -- False once more than 6 posts older than
            the date window are seen or the page limit is reached.
        """
        old_elements = 0
        for element in contents_list:
            if element['date'].date() > self.crawl_init.get_end_day():
                # Too new: skip, but keep paging.
                # printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))

            elif element['date'].date() < self.crawl_init.get_begin_day():
                # Too old: the feed is not strictly ordered, so tolerate a
                # few old posts before giving up on further pages.
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                old_elements += 1
                if old_elements > 6:
                    return False
            else:
                if not element['url'] in backup_set:
                    # printl(element['url'])
                    # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                    # wait(1.5)
                    # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
                    self.list_crawl.append(element)
                    backup_set.add(element['url'])
                    if self.is_until_page():
                        return False
        if self.list_crawl:
            printl("Number of Lists = {0}".format(len(self.list_crawl)))
        return True

    def crawl_list(self):
        """Download and store every post queued in self.list_crawl."""
        if self.list_crawl:
            printl()
            printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("Total gathered contents = {0}".format(len(self.list_crawl)))
            printl()
        for element in self.list_crawl:
            try:
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                wait(body_wait_sec)
                # Fresh cookie jar per post; the post URL doubles as referer.
                self.crawl_content(element['url'], {}, element['url'])
            except Exception as e:
                # A single failed post must not abort the whole batch.
                printl(e)
                logging.info(e)
|
||||
|
||||
|
||||
class InstaAlgorithmNormal(InstaAlgorithm):
    """Driver-less crawl: everything goes through the requests-based
    ListTag/ListUser classes.  Any browser passed in is shut down."""

    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)
        # This algorithm never uses the browser, so release it immediately.
        if self.driver:
            self.driver.quit()

    def crawl(self):
        """Crawl every configured URL via plain HTTP; repeats in realtime mode."""
        real_time = True
        while real_time:
            printl("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # first connect
                try:
                    printl(url_list[i] + "\n")
                    # Tag URLs and user URLs need different listing crawlers.
                    if insta_tag_url in url_list[i]:
                        list_crawler = ListTag(url_list[i])
                    else:
                        list_crawler = ListUser(url_list[i])
                    wait(1)
                    insta_list = list_crawler.get_list()
                    is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    # ajax load: keep paging while new in-range posts appear
                    while is_load_more:
                        if end_cursor:
                            # Resume from the cursor saved by a previous failure.
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    self.crawl_list()
                    self.list_crawl.clear()
                    i += 1
                except Exception as e:
                    logging.info(e)
                    # NOTE(review): list_crawler may be unbound if the failure
                    # happened before the first page loaded -- confirm.
                    end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + end_cursor)
                    if e.args:
                        wait(300)
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")
|
||||
|
||||
|
||||
class InstaAlgorithmBrowser(InstaAlgorithm):
    """Crawl variant that drives a real browser for the first page load,
    then continues through the ajax endpoint with the browser's cookies."""

    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)

    def url_load(self, url):
        """Parse the page currently loaded in the browser.

        Returns (list_crawler, posts, end_cursor, has_next).  Note the
        ListTag/ListUser instance performs its own HTTP GET in addition to
        the browser navigation; the post list itself is parsed from the
        browser's page_source.
        """
        if insta_tag_url in url:
            list_tag = ListTag(url)
            insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source)
            return list_tag, insta_list, end_cursor, has_next
        else:
            list_user = ListUser(url)
            insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source)
            return list_user, insta_list, end_cursor, has_next

    def crawl(self):
        """Crawl every configured URL, restarting the browser on failure."""
        real_time = True
        while real_time:
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None
            backup_set = set()
            while i < len(url_list):
                # first connect
                try:
                    wait(3)
                    printl(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i])
                    is_load_more = self.crawl_contents(insta_list, backup_set) and has_next
                    list_crawler.set_end_cursor(end_cursor2)
                    # Hand the browser's session cookies to the ajax crawler.
                    list_crawler.cookies = {i['name']: i['value'] for i in self.driver.get_cookies()}
                    # ajax load
                    page_down = 0
                    while is_load_more:
                        # Every self.page_down pages, poke the real browser:
                        # try the load-more link, fall back to scrolling.
                        if page_down == self.page_down:
                            page_down = 0
                            try:
                                focus_driver(self.driver)
                                click_insta_load_more(self.driver)
                            except:
                                push_page_down(self.driver)
                        # NOTE(review): increment placement reconstructed --
                        # counting every ajax page; confirm against original.
                        page_down += 1
                        if end_cursor:
                            # Resume from the cursor saved by a previous failure.
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        # printl("list length = " + str(len(insta_list)))
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                        # printl("number of backup_set = {0}".format(len(backup_set)))
                    i += 1
                    self.crawl_list()
                    self.list_crawl.clear()
                except Exception as e:
                    logging.info(e)
                    end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + end_cursor)
                    if e.args:
                        wait(300)
                    # Restart the browser so the next attempt gets a clean session.
                    if self.driver:
                        self.driver.close()
                    wait(3)
                    self.driver = self.browser.new_browser()
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")
|
||||
|
||||
|
||||
class InstaMainCrawler:
    """Top-level entry point: wires configuration, the DB sender and the
    browser driver together and launches the appropriate crawl algorithm."""

    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = InstaInit()
        self.browser = Browser()
        self.driver = None          # set by init_browser()/set_driver()

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def crawl_all(self, backup_set=None):
        # Placeholder kept for interface compatibility with other crawlers.
        pass

    def start(self):
        self.crawler_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Apply all command-line style arguments in one call."""
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)
        self.init_browser(browser)

    def set_driver(self, driver):
        self.driver = driver

    def init_browser(self, browser):
        # Browser start-up failure is tolerated: crawler_start() falls back
        # to the driver-less algorithm when self.driver stays None.
        try:
            self.set_driver(self.browser.get_new_driver(browser))
        except Exception as e:
            logging.info(e)

    def init_keyword_id(self, keyword_id):
        """Store the keyword id and load its crawl parameters.

        int() is a no-op for ints and parses numeric strings, replacing the
        original ``type(keyword_id) != int`` branch with the idiomatic form.
        The raw argument is still forwarded to get_keyword_parameters, as
        before.
        """
        self.keyword_id = int(keyword_id)
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def crawler_start(self):
        """Run the browser-backed algorithm when a driver is available,
        otherwise the plain-requests algorithm."""
        if self.driver:
            algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
                                              self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        else:
            algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
                                             self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        algorithm.start_crawl()
|
||||
@@ -8,14 +8,20 @@ from selenium.webdriver.common.keys import Keys
|
||||
|
||||
|
||||
def pageup_and_pagedown(_driver):
|
||||
body = _driver.find_element_by_tag_name('body')
|
||||
for i in range(0, 2):
|
||||
body.send_keys(Keys.PAGE_UP)
|
||||
wait(0.2)
|
||||
for i in range(0, 5):
|
||||
body.send_keys(Keys.PAGE_DOWN)
|
||||
wait(0.2)
|
||||
# body = _driver.find_element_by_tag_name('body')
|
||||
# for i in range(0, 2):
|
||||
# body.send_keys(Keys.PAGE_UP)
|
||||
# wait(3)
|
||||
# for i in range(0, 5):
|
||||
# body.send_keys(Keys.PAGE_DOWN)
|
||||
# wait(3)
|
||||
for i in range(0, 3):
|
||||
_driver.execute_script("window.scrollBy(0, -300)")
|
||||
wait(0.4)
|
||||
|
||||
for i in range(0, 5):
|
||||
_driver.execute_script("window.scrollBy(0, 800)")
|
||||
wait(0.4)
|
||||
|
||||
def first_load(_driver):
|
||||
element = _driver.find_element_by_css_selector("div._pupj3 > a")
|
||||
@@ -39,7 +45,7 @@ def remove_myci9(_driver):
|
||||
|
||||
|
||||
browser = Browser()
|
||||
driver = browser.get_new_driver('chrome')
|
||||
driver = browser.get_new_driver('ie')
|
||||
|
||||
url_sets = set()
|
||||
wait(5)
|
||||
@@ -53,9 +59,9 @@ wait(5)
|
||||
first_load(driver)
|
||||
wait(3)
|
||||
|
||||
print(driver.get_cookies())
|
||||
#print(driver.get_cookies())
|
||||
|
||||
with open("c:\\data\\instajumma.txt", 'w') as f:
|
||||
with open("c:\\data\\instajummaie.txt", 'w') as f:
|
||||
try:
|
||||
while True:
|
||||
for j in range(0, 10):
|
||||
|
||||
Reference in New Issue
Block a user