instagram 멀티로 실행하게 만들기

git-svn-id: svn://192.168.0.12/source@287 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2016-08-18 07:44:21 +00:00
parent 4374ded578
commit cff46799eb
5 changed files with 1280 additions and 103 deletions

View File

@@ -10,6 +10,9 @@ import os
import psutil
import threading
import re
import pymysql
import random
from time import localtime, strftime
from selenium import webdriver
@@ -19,6 +22,7 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def print_and_flush(string):
print(string)
sys.stdout.flush()
@@ -77,7 +81,6 @@ def find_elements_by_xpath(driver, tag, time=0):
)
return elements
class Browser:
def __init__(self, driver=None):
self.driver = driver
@@ -120,7 +123,8 @@ class Browser:
self.chrome_basename = os.path.basename(driver_exec)
if self.is_server_executed(self.chrome_basename):
port = self.port(self.chrome_basename)
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.CHROME)
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port,
desired_capabilities=webdriver.DesiredCapabilities.CHROME)
else:
self.driver = webdriver.Chrome(self.chrome_driver_path)
return self.driver
@@ -151,9 +155,11 @@ class Browser:
self.opera_basename = os.path.basename(driver_exec)
if self.is_server_executed(self.opera_basename):
port = self.port(self.opera_basename)
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port, desired_capabilities=webdriver.DesiredCapabilities.OPERA)
self.driver = webdriver.Remote(command_executor="http://127.0.0.1:"+port,
desired_capabilities=webdriver.DesiredCapabilities.OPERA)
else:
self.driver = webdriver.Opera(desired_capabilities=webdriver.DesiredCapabilities.OPERA, executable_path=self.opera_driver_path)
self.driver = webdriver.Opera(desired_capabilities=webdriver.DesiredCapabilities.OPERA,
executable_path=self.opera_driver_path)
return self.driver
def driver(self):
@@ -244,9 +250,9 @@ class SendtoDB:
if e.args[0] == 2013 or e.args[0] == 2006 or e.args[0] == 10054: # Lost connection to server
print("connection lost. try to reconnection")
self.conn = self.pymysql.connect(host='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor)
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor)
with self.conn.cursor() as cursor:
cursor.execute(query)
self.conn.commit()
@@ -319,7 +325,7 @@ class CrawlInit:
pymysql = __import__('pymysql.cursors')
def __init__(self, before_day=0):
self.conn = self.pymysql.connect(host ='bigbird.iptime.org',
self.conn = self.pymysql.connect(host='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor)

View File

@@ -0,0 +1,147 @@
import re
import random
import pymysql
import os
from selenium import webdriver
import sys
# File holding proxy entries; when present it overrides the DB proxy list.
proxy_filename = 'proxy.txt'
# Matches "a.b.c.d<non-digit>port" where port has 2-5 digits.
re_ip = re.compile('([\d]{1,3}\.[\d]{1,3}\.[\d]{1,3}\.[\d]{1,3})[^\d]([\d]{2,5})')
random.seed()
# Per-OS webdriver executable names (None = no driver binary configured).
linux_driver_path = {
    'chrome': 'chromedriver',
    'opera': 'operadriver',
    'firefox': None,
    'ie': None
}
window_driver_path = {
    'firefox': None,
    'chrome': 'chromedriver.exe',
    'ie': 'IEDriverServer.exe',
    'opera': 'operadriver.exe'
}
# Pick the table matching the current OS at import time.
driver_path = window_driver_path if sys.platform == 'win32' else linux_driver_path
# Capability templates; get_driver() copies these before mutating them.
platform_desired_capabilities = {
    'firefox': webdriver.DesiredCapabilities.FIREFOX,
    'ie': webdriver.DesiredCapabilities.INTERNETEXPLORER,
    'opera': webdriver.DesiredCapabilities.OPERA,
    'chrome': webdriver.DesiredCapabilities.CHROME
}
# Constructor per platform, keyed the same way as the tables above.
platform_webdriver = {
    'firefox': webdriver.Firefox,
    'chrome': webdriver.Chrome,
    'ie': webdriver.Ie,
    'opera': webdriver.Opera
}
# Earlier merged-table design, kept for reference:
# pl_webdriver = {
#     'firefox': {
#         'path': None,
#         'desired_capabilities': webdriver.DesiredCapabilities.FIREFOX,
#         'webdriver': webdriver.Firefox
#     },
#     'chrome': {
#         'path': 'chromedriver.exe' if sys.platform == 'win32' else 'chromedriver',
#         'desired_capabilities': webdriver.DesiredCapabilities.CHROME,
#         'webdriver': webdriver.Chrome
#     },
#     'ie': {
#         'path': 'IEDriverServer.exe' if sys.platform == 'win32' else None,
#         'desired_capabilities': webdriver.DesiredCapabilities.INTERNETEXPLORER,
#         'webdriver': webdriver.Ie
#     },
#     'opera': {
#         'path': 'operadriver.exe' if sys.platform == 'win32' else 'operadriver',
#         'desired_capabilities': webdriver.DesiredCapabilities.OPERA,
#         'webdriver': webdriver.Opera
#     }
# }
def get_driver(platform, proxies):
    """
    Build a selenium webdriver for *platform* with *proxies* applied.

    :param platform: 'chrome', 'ie', 'opera', 'firefox'
    :param proxies: format : ip:port ex) '192.168.0.1:9999'
    :return: driver applied proxy
    """
    # copy desired_capabilities so the shared module-level template is not mutated
    desired_capabilities = platform_desired_capabilities[platform].copy()
    # set the same proxy for every scheme; MANUAL disables auto-detection
    desired_capabilities['proxy'] = {
        'httpProxy': proxies,
        'ftpProxy': proxies,
        'sslProxy': proxies,
        'noProxy': None,
        'proxyType': 'MANUAL',
        # 'autodetect': False
        # 'autodetect': True
    }
    # return driver applied proxy
    # NOTE(review): IE's constructor is called with `capabilities=` while the
    # others get `desired_capabilities=` — confirm against the installed
    # selenium version before changing.
    if platform == 'ie':
        return platform_webdriver[platform](executable_path=driver_path[platform],
                                            capabilities=desired_capabilities)
    if driver_path[platform]:
        return platform_webdriver[platform](executable_path=driver_path[platform],
                                            desired_capabilities=desired_capabilities)
    # for firefox (no driver binary configured in driver_path)
    else:
        return platform_webdriver[platform](capabilities=desired_capabilities)
def get_proxy_from_file(filename):
    """
    Pick one random proxy entry from *filename*.

    :param filename: text file with one "ip<sep>port" entry per line
    :return (ip, port): string, string
        if ip, port or filename is invalid, return (None, None)
    """
    # `with` closes the handle deterministically; the original left the file
    # object open and relied on garbage collection.
    with open(filename) as f:
        proxy_lists = [line.replace('\n', '') for line in f if re_ip.search(line)]
    if proxy_lists:
        # random.choice replaces the manual randint index arithmetic
        m = re_ip.search(random.choice(proxy_lists))
        if m:
            return m.group(1), m.group(2)
    return (None, None)
def get_proxy_from_db():
    """
    Pick one random (ip, port) pair from the Proxy table.

    :return (ip, port): strings, or (None, None) on any DB failure.
    """
    # Initialize to None so the cleanup path is safe even when connect()
    # itself raises (the original called conn.close() on an unbound name,
    # turning a connection error into a NameError).
    conn = None
    try:
        conn = pymysql.connect(host='bigbird.iptime.org',
                               user='admin', passwd='admin123',
                               db='concepters', charset='utf8',
                               cursorclass=pymysql.cursors.DictCursor)
        with conn.cursor() as cursor:
            cursor.execute("select * from Proxy")
            # skip rows without a port; stringify both fields
            proxy_lists = [(str(i['Proxy']), str(i['Port'])) for i in cursor if i['Port']]
        return random.choice(proxy_lists) if proxy_lists else (None, None)
    except Exception:
        # best-effort lookup: any failure degrades to "no proxy"
        return (None, None)
    finally:
        if conn is not None:
            conn.close()
def get_proxy():
    """Return an (ip, port) pair, preferring the local file over the DB."""
    # os.path.isfile() is already False for missing paths, so the original's
    # separate exists() check folds into one call with identical behavior.
    if os.path.isfile(proxy_filename):
        return get_proxy_from_file(proxy_filename)
    return get_proxy_from_db()
def get_requests_proxy(proxies):
    """Build a requests-style proxies mapping from an "ip:port" string.

    Both schemes deliberately use the http:// prefix.
    """
    endpoint = 'http://' + proxies
    return {'http': endpoint, 'https': endpoint}
def get_proxy_for_requests():
    """Fetch a proxy via get_proxy() and format it for the requests library.

    NOTE(review): get_proxy() can return (None, None); the concatenation
    below would then raise TypeError — confirm callers tolerate that.
    """
    ip, port = get_proxy()
    return get_requests_proxy(ip + ":" + port)

View File

@@ -10,7 +10,10 @@ import insta.instaparser as instaparser
import insta.instaheaders as instaheaders
import requests
import logging
# from multiprocessing import Queue
# import multiprocessing
from queue import Queue
import threading
from base.baseclasses import SendtoDB
from base.baseclasses import CrawlInit
@@ -18,6 +21,7 @@ from base.baseclasses import wait
from base.baseclasses import Browser
from selenium.webdriver.common.keys import Keys
from base.baseclasses import enter_element
import base.proxy
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
@@ -29,6 +33,13 @@ insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'
is_debuging = False
is_debug = False
def printd(*objects, sep=' ', end='\n', file=None, flush=True):
    """print() gated on the module-level is_debug flag; flushes by default."""
    if is_debug:
        print(*objects, sep=sep, end=end, file=file, flush=flush)
num_of_list_ajax = 24
num_of_reply_ajax = 100
@@ -36,7 +47,9 @@ list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
num_of_content_process = 10
requests_timeout = 60
num_of_retry_proxy = 5
logging.basicConfig(level=logging.INFO,
format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
@@ -62,6 +75,240 @@ def focus_driver(driver):
driver.set_window_position(position['x'], position['y'])
def instance_wrapper(func):
    """Decorator: keep calling *func* with a fresh proxy until it returns truthy.

    The last working (ip, port) pair lives in the closure, so successive
    calls through the wrapper reuse a proxy that is known to be good.
    """
    # to save nice ip, port of proxy
    ip, port = base.proxy.get_proxy()

    def retry_load(*args, **kwargs):
        while True:
            # use closure state shared across all calls of this wrapper
            nonlocal ip, port
            # NOTE(review): get_proxy() may return (None, None); the
            # concatenation below would then raise TypeError — confirm.
            proxies = base.proxy.get_requests_proxy(ip + ":" + port)
            kwargs['proxies'] = proxies
            # retry = num_of_retry_proxy
            # while retry:
            res = func(*args, **kwargs)
            if res:
                # printl("id : {2} - connect success - {0}:{1}".format(ip, port, threading.get_ident()))
                return res
            # if the proxy was not good, get new proxy
            # printl('id : {2} - connect failed - {0}:{1}'.format(ip, port, threading.get_ident()))
            ip, port = base.proxy.get_proxy()
            # retry -= 1
    return retry_load
class InstanceWrapper(object):
    """Call a factory *func* repeatedly, rotating proxies until it succeeds.

    Class form of instance_wrapper(): the crawl threads can also force a
    rotation via change_proxy().  (Indentation of the retry loops below was
    reconstructed from a whitespace-stripped diff — verify against VCS.)
    """

    def __init__(self, func):
        # current proxy endpoint, shared by every do_* method
        self.ip, self.port = base.proxy.get_proxy()
        self.func = func
        self.num_of_retry_proxy = num_of_retry_proxy

    def do(self, *args, **kwargs):
        """Retry forever, taking a new proxy after every falsy result."""
        while True:
            proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
            kwargs['proxies'] = proxies
            # retry = num_of_retry_proxy
            # while retry:
            res = self.func(*args, **kwargs)
            if res:
                # printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
                return res
            # if the proxy was not good, get new proxy
            # printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
            self.ip, self.port = base.proxy.get_proxy()
            # retry -= 1

    def do_retry(self, *args, **kwargs):
        """Like do(), but bounded to num_of_retry_proxy attempts per round.

        NOTE(review): the rotated (ip, port) only takes effect when the outer
        loop rebuilds `proxies` — confirm this is intended.
        """
        while True:
            proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
            kwargs['proxies'] = proxies
            retry = self.num_of_retry_proxy
            while retry:
                res = self.func(*args, **kwargs)
                if res:
                    # printl("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
                    return res
                # if the proxy was not good, get new proxy
                # printl('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
                retry -= 1
                self.ip, self.port = base.proxy.get_proxy()

    def do_no_proxy(self, *args, **kwargs):
        """Bounded proxy retries, then one final attempt without any proxy."""
        while True:
            retry = self.num_of_retry_proxy
            while retry:
                proxies = base.proxy.get_requests_proxy(self.ip + ":" + self.port)
                kwargs['proxies'] = proxies
                res = self.func(*args, **kwargs)
                if res:
                    printd("id : {2} - connect success - {0}:{1}".format(self.ip, self.port, threading.get_ident()))
                    return res
                # if the proxy was not good, get new proxy
                printd('id : {2} - connect failed - {0}:{1}'.format(self.ip, self.port, threading.get_ident()))
                retry -= 1
                self.ip, self.port = base.proxy.get_proxy()
            # if get content with proxy failed, set no proxy
            # func guarantee returning a instance except the case where a url is invalid
            kwargs['proxies'] = None
            res = self.func(*args, **kwargs)
            # if res:
            #     printl("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident()))
            # printl(args, kwargs)
            printd("id : {2} - connect success - {0}:{1}".format('None', 'None', threading.get_ident()))
            return res

    def change_proxy(self):
        """Force rotation to a fresh proxy endpoint."""
        self.ip, self.port = base.proxy.get_proxy()
@instance_wrapper
def make_list_instance(url, proxies=None):
    """Create a tag- or user-list crawler for *url*.

    Returns None on any failure so the wrapping retry logic fetches a new
    proxy and tries again.
    """
    crawler_cls = ListTag if insta_tag_url in url else ListUser
    try:
        return crawler_cls(url, proxies)
    except:
        return None
# @instance_wrapper  (intentionally not applied; InstanceWrapper drives this)
def make_content_instance(url, proxies=None):
    """Build an InstaContent for *url*; None on failure so callers can retry."""
    try:
        return InstaContent(url, {}, url, proxies)
    except:
        return None
def ajax_wrapper(func):
    """Decorator: retry *func* up to num_of_retry_proxy times until non-None.

    Returns the first non-None result, or None after the attempts run out.
    """
    def retry_ajax_load(*args, **kwargs):
        # Pre-initialize so the function is well-defined even if
        # num_of_retry_proxy is ever configured <= 0 (the original would
        # raise NameError on the final `return res` in that case).
        res = None
        retry = num_of_retry_proxy
        while retry:
            res = func(*args, **kwargs)
            if res is not None:
                break
            retry -= 1
        return res
    return retry_ajax_load
@ajax_wrapper
def load_ajax_list(ins):
    """Load the next ajax page from a list crawler.

    Returns None on any failure so ajax_wrapper retries the call.
    """
    try:
        return ins.load_more()
    except:
        return None
@ajax_wrapper
def load_ajax_reply(ins):
    """Load the next ajax page of replies from a content crawler.

    Returns None on any failure so ajax_wrapper retries the call.
    """
    try:
        return ins.load_reply_more()
    except:
        return None
# def crawl_content_process(qu, keyword_id, db_num):
# send_to_db = SendtoDB()
# send_to_db.set_db(db_num)
# while True:
# element = qu.get()
# if element is None:
# break
# ok = True
# while ok:
# try:
# ip, port = base.proxy.get_proxy()
# proxies = base.proxy.get_requests_proxy(ip + ":" + port)
# content = InstaContent(element['url'], {}, element['url'], proxies)
# body = content.get_body()
# replies = content.get_reply()
# body['article_url'] = element['url']
# body['keyword_id'] = keyword_id
# while content.has_previous:
# replies = content.load_reply_more() + replies
# wait(reply_wait_sec)
# for j in range(0, len(replies)):
# replies[j]['article_url'] = body['article_url']
# replies[j]['platform_id'] = body['platform_id']
# replies[j]['article_order'] = j
# send_to_db.delete_url(body['article_url'])
# send_to_db.send_body(body)
# if replies:
# send_to_db.send_reply(replies)
# printl(element['url'])
# printl('ok')
# ok = False
# except:
# printl("failed proxy {0}:{1}".format(ip, port))
# printl('finish thread')
def crawl_content_process(qu, keyword_id, db_num):
    """Worker loop: take list elements off *qu*, crawl each post, store to DB.

    :param qu: queue of dicts with at least a 'url' key; a None item stops the worker
    :param keyword_id: id attached to every stored body row
    :param db_num: DB selector passed to SendtoDB.set_db()
    """
    # m_c_i = instance_wrapper(make_content_instance)
    m_c_i = InstanceWrapper(make_content_instance)
    send_to_db = SendtoDB()
    send_to_db.set_db(db_num)
    while True:
        element = qu.get()
        if element is None:
            # sentinel: producer signals shutdown
            break
        ok = True
        while ok:
            try:
                # get a instance of InstaContent by do_no_proxy func.
                # if element['url'] is invalid, content is None
                content = m_c_i.do_no_proxy(element['url'])
                if not content:
                    break
                body = content.get_body()
                replies = content.get_reply()
                body['article_url'] = element['url']
                body['keyword_id'] = keyword_id
                # page backwards through older replies until exhausted
                while content.has_previous:
                    rep = load_ajax_reply(content)
                    if rep is None:
                        # reply ajax kept failing: rotate proxy and redo post
                        printl("proxies = ", content.proxies)
                        m_c_i.change_proxy()
                        raise Exception("reply load error")
                    replies = rep + replies
                    wait(reply_wait_sec)
                # stamp each reply with its parent article and ordering
                for j in range(0, len(replies)):
                    replies[j]['article_url'] = body['article_url']
                    replies[j]['platform_id'] = body['platform_id']
                    replies[j]['article_order'] = j
                # delete-then-insert keeps the stored article fresh
                send_to_db.delete_url(body['article_url'])
                send_to_db.send_body(body)
                if replies:
                    send_to_db.send_reply(replies)
                printl(element['url'])
                printl('ok')
                ok = False
            except UnicodeEncodeError as ue:
                # un-storable text: skip this element entirely
                printl(element['url'])
                printl(ue)
                break
            except Exception as e:
                # catch error when send_to_db error occur; loop retries element
                printl(element['url'])
                printl(e)
    printl('finish thread')
class InstaInit(CrawlInit):
def __init__(self, before_day=0):
super().__init__(before_day)
@@ -107,7 +354,7 @@ class InstaInit(CrawlInit):
class ListTag:
def __init__(self, url):
def __init__(self, url, proxies=None):
self.__r = None
self.__tag = ''
self.__url = ''
@@ -115,54 +362,31 @@ class ListTag:
self.end_cursor = None
self.has_next = False
self.cookies = {}
self.load_url(url)
def load_url(self, url):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
if is_debuging:
printl("<ListTag Start>")
printl("<ListTag requests>")
printl('headers = ', end=' ')
printl(instaheaders.get_headers_for_list_html())
self.proxies = proxies
self.load_url(url, self.proxies)
def load_url(self, url, proxies):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
timeout=requests_timeout)
self.log_load_url_before()
self.__r.raise_for_status()
self.__tag = self.__get_tag(url)
self.__set_cookies(self.__r.cookies)
self.__url = url
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
if is_debuging:
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl("<ListTag End>")
self.log_load_url_after()
return self.list_tag
def load_more(self):
form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
if is_debuging:
printl("<ListTag Start>")
printl("<ListTag requests>")
printl('end_cursor = ' + str(self.end_cursor))
printl('form_data' + form_data)
printl('headers = ', end=' ')
printl(headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data)
self.log_load_more_before(form_data, headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
timeout=requests_timeout)
self.__set_cookies(self.__r.cookies)
self.__r.raise_for_status()
self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
if is_debuging:
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl("<ListTag End>")
self.log_load_more_after()
return self.list_tag
def __get_tag(self, url):
@@ -191,9 +415,52 @@ class ListTag:
def get_list(self):
return self.list_tag
def get_proxy(self):
return self.proxies
def log_load_url_before(self):
if is_debuging:
printl("<ListTag Start>")
printl("<ListTag requests>")
printl('headers = ', end=' ')
printl(instaheaders.get_headers_for_list_html())
def log_load_url_after(self):
if is_debuging:
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl('proxies = ', end='')
printl(self.proxies)
printl("<ListTag End>")
def log_load_more_before(self, form_data, headers):
if is_debuging:
printl("<ListTag Start>")
printl("<ListTag requests>")
printl('end_cursor = ' + str(self.end_cursor))
printl('form_data' + form_data)
printl('headers = ', end=' ')
printl(headers)
def log_load_more_after(self):
if is_debuging:
printl("<ListTag response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl('proxies = ', end='')
printl(self.proxies)
printl("<ListTag End>")
class ListUser:
def __init__(self, url):
def __init__(self, url, proxies=None):
self.__r = None
self.__user = ''
self.__url = ''
@@ -201,10 +468,12 @@ class ListUser:
self.end_cursor = None
self.has_next = False
self.cookies = {}
self.load_url(url)
self.proxies = proxies
self.load_url(url, self.proxies)
def load_url(self, url):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
def load_url(self, url, proxies):
self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html(), proxies=proxies,
timeout=requests_timeout)
self.__r.raise_for_status()
self.__url = url
self.__set_cookies(self.__r.cookies)
@@ -214,26 +483,14 @@ class ListUser:
def load_more(self):
form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
if is_debuging:
printl("<ListUser Start>")
printl("<ListUser requests>")
printl('end_cursor = ' + str(self.end_cursor))
printl('form_data' + form_data)
printl('headers = ', end=' ')
printl(headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data)
self.log_load_more_before(form_data, headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
timeout=requests_timeout)
self.__r.raise_for_status()
self.__set_cookies(self.__r.cookies)
if is_debuging:
printl("<ListUser response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl("<ListUser End>")
self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
self.log_load_more_after()
return self.list_user
def get_cookies(self):
@@ -255,9 +512,33 @@ class ListUser:
def get_list(self):
return self.list_user
def get_proxy(self):
return self.proxies
def log_load_more_before(self, form_data, headers):
if is_debuging:
printl("<ListUser Start>")
printl("<ListUser requests>")
printl('end_cursor = ' + str(self.end_cursor))
printl('form_data' + form_data)
printl('headers = ', end=' ')
printl(headers)
def log_load_more_after(self):
if is_debuging:
printl("<ListUser response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('end_cursor = ' + str(self.end_cursor))
printl('has_next = ', end='')
printl(self.has_next)
printl('proxies = ', end='')
printl(self.proxies)
printl("<ListUser End>")
class InstaContent:
def __init__(self, url, cookies, referer):
def __init__(self, url, cookies, referer, proxies=None):
self.__r = None
self.__referer = ''
self.__code = ''
@@ -266,11 +547,13 @@ class InstaContent:
self.start_cursor = None
self.has_previous = False
self.cookies = {}
self.load_url(url, cookies, referer)
self.proxies = proxies
self.load_url(url, cookies, referer, self.proxies)
def load_url(self, url, cookies, referer):
def load_url(self, url, cookies, referer, proxies):
self.__set_cookies(cookies)
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies), proxies=proxies,
timeout=requests_timeout)
self.__r.raise_for_status()
self.__referer = referer
self.__code = self.__get_code(url)
@@ -287,25 +570,13 @@ class InstaContent:
def load_reply_more(self):
form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
if is_debuging:
printl("<ContentReply Start>")
printl("<ContentReply requests>")
printl('start_cursor = ' + self.start_cursor)
printl('form_data' + form_data)
printl('headers = ', end=' ')
printl(headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data)
self.log_load_reply_more_before(form_data, headers)
self.__r = requests.post(insta_query, headers=headers, data=form_data, proxies=self.proxies,
timeout=requests_timeout)
self.__r.raise_for_status()
self.__set_cookies(self.__r.cookies)
self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
if is_debuging:
printl("<ContentReply response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('start_cursor = ' + str(self.start_cursor))
printl('has_previous = ', end='')
printl(self.has_previous)
printl("<ContentReply End>")
self.log_load_reply_more_after()
return self.reply
def get_cookies(self):
@@ -322,6 +593,30 @@ class InstaContent:
for k, v in cookies.items():
self.cookies[k] = v
def get_proxy(self):
return self.proxies
def log_load_reply_more_before(self, form_data, headers):
if is_debuging:
printl("<ContentReply Start>")
printl("<ContentReply requests>")
printl('start_cursor = ' + self.start_cursor)
printl('form_data' + form_data)
printl('headers = ', end=' ')
printl(headers)
def log_load_reply_more_after(self):
if is_debuging:
printl("<ContentReply response>")
printl('self.__r.cookies=', end='')
printl(self.__r.cookies)
printl('start_cursor = ' + str(self.start_cursor))
printl('has_previous = ', end='')
printl(self.has_previous)
printl('proxies = ', end='')
printl(self.proxies)
printl("<ContentReply End>")
class InstaAlgorithm:
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
@@ -474,6 +769,123 @@ class InstaAlgorithmNormal(InstaAlgorithm):
printl("Finished Crawling :)")
class InstaAlgorithmMulti(InstaAlgorithm):
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second=2, num_of_load_content=12, page_down=50):
super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second, num_of_load_content, page_down)
if self.driver:
self.driver.quit()
self.list_crawl = Queue()
self.total_num = 0
def crawl_contents(self, contents_list, backup_set):
"""
:param contents_list:
:param backup_set:
:return: is_load_more
"""
old_elements = 0
for element in contents_list:
if element['date'].date() > self.crawl_init.get_end_day():
# printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
elif element['date'].date() < self.crawl_init.get_begin_day():
printl(element['url'])
printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
old_elements += 1
if old_elements > 6:
return False
else:
if not element['url'] in backup_set:
# printl(element['url'])
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
# wait(1.5)
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
self.list_crawl.put(element)
backup_set.add(element['url'])
self.total_num += 1
if self.is_until_page():
return False
# if self.list_crawl:
# printl("Number of Lists = {0}".format(len(self.list_crawl)))
return True
def crawl(self):
real_time = True
while real_time:
printl("Crawling Start")
url_list = self.crawl_init.make_url()
i = 0
end_cursor = None
backup_set = set()
while i < len(url_list):
# first connect
try:
printl(url_list[i] + "\n")
# insta_content process create and start
# p_list = [multiprocessing.Process(target=crawl_content_process,
# args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
# for i in range(num_of_content_process)]
p_list = [threading.Thread(target=crawl_content_process,
args=(self.list_crawl, self.keyword_id, self.send_to_db.db_num))
for i in range(num_of_content_process)]
for p in p_list:
p.daemon = True
p.start()
# crawl list
ok = True
while ok:
try:
list_crawler = make_list_instance(url_list[i])
ok = False
except Exception as e:
printl(e)
wait(1)
insta_list = list_crawler.get_list()
is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
# ajax load
while is_load_more:
if end_cursor:
list_crawler.end_cursor = end_cursor
end_cursor = None
wait(self.reload_wait_second)
try:
insta_list = load_ajax_list(list_crawler)
if insta_list is None:
break
is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
except Exception as e:
printl('is_load_more exception')
printl(e)
is_load_more = False
#self.crawl_list()
#self.list_crawl.close()
printl("end load")
printl("total number of crawled list = {0}".format(self.total_num))
self.total_num = 0
# stop child process
for i in range(num_of_content_process):
self.list_crawl.put(None)
# wait child process
for p in p_list:
p.join()
i += 1
except Exception as e:
logging.info(e)
end_cursor = list_crawler.end_cursor
printl('end_cursor=' + end_cursor)
if e.args:
wait(300)
real_time = self.crawl_init.is_realtime()
printl("Finished Crawling :)")
class InstaAlgorithmBrowser(InstaAlgorithm):
def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
reload_wait_second=2, num_of_load_content=12, page_down=50):
@@ -548,7 +960,8 @@ class InstaMainCrawler:
def __init__(self):
self.send_to_db = SendtoDB()
self.crawl_init = InstaInit()
self.browser = Browser()
# self.browser = Browser()
self.browser = None
self.driver = None
def set_keyword_id(self, keyword_id):
@@ -565,7 +978,7 @@ class InstaMainCrawler:
self.init_db(db_num)
self.init_before_day(before_day)
self.init_until_page(until_page)
self.init_browser(browser)
# self.init_browser(browser)
def set_driver(self, driver):
self.driver = driver
@@ -594,10 +1007,12 @@ class InstaMainCrawler:
self.crawl_init.set_until_page(until_page)
def crawler_start(self):
if self.driver:
algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
else:
algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
# if self.driver:
# algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
# self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
# else:
# algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
# self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
algorithm = InstaAlgorithmMulti(self.driver, self.browser, self.crawl_init, self.send_to_db,
self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
algorithm.start_crawl()

View File

@@ -0,0 +1,603 @@
#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import re
import datetime
import insta.instaparser as instaparser
import insta.instaheaders as instaheaders
import requests
import logging
from base.baseclasses import SendtoDB
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import Browser
from selenium.webdriver.common.keys import Keys
from base.baseclasses import enter_element
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
    """print() that flushes by default so log lines appear immediately."""
    print(*objects, sep=sep, end=end, file=file, flush=flush)
# Instagram endpoints used by the crawler classes below.
insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/'
# request/response tracing toggle used by the crawler classes
is_debuging = False
# page sizes requested from the ajax endpoints
num_of_list_ajax = 24
num_of_reply_ajax = 100
# delays (seconds) between successive requests of each kind
list_wait_sec = 0.9
body_wait_sec = 0.5
reply_wait_sec = 0.8
num_of_page_down = 20
logging.basicConfig(level=logging.INFO,
                    format="%(module)s(%(lineno)s):%(funcName)s:%(message)s")
# silence chatty third-party loggers
logging.getLogger('requests').setLevel(logging.WARNING)
logging.getLogger('pymysql').setLevel(logging.WARNING)
def click_insta_load_more(driver):
    """Activate the anchor under div._pupj3 (the CSS class is tied to a
    specific Instagram page layout and may break when the site changes)."""
    element = driver.find_element_by_css_selector("div._pupj3 > a")
    enter_element(element)
def push_page_down(driver):
    """Scroll the page by sending one PAGE_DOWN key press to <body>."""
    body = driver.find_element_by_tag_name('body')
    body.send_keys(Keys.PAGE_DOWN)
def focus_driver(driver):
    """Bring the browser window to front by maximizing it, then restoring
    its previous size and position."""
    position = driver.get_window_position()
    size = driver.get_window_size()
    driver.maximize_window()
    driver.set_window_size(size['width'], size["height"])
    driver.set_window_position(position['x'], position['y'])
class InstaInit(CrawlInit):
    """Crawl configuration for Instagram: builds search URLs and date bounds."""

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # platform id -> base URL (9 = tag search, 10 = user page)
        self.urls = dict()
        self.urls[9] = insta_tag_url
        self.urls[10] = insta_url

    def split_searches(self):
        """Split the comma-separated search string into cleaned terms."""
        search = self.searches()
        splited_list = search.split(',')
        trimmed_list = list()
        if self.platform() == 10:
            # user names get a plain whitespace trim
            for x in splited_list:
                trimmed_list.append(x.strip())
        else:
            # tags go through the parent's utf8() helper
            # NOTE(review): utf8() is defined in CrawlInit — semantics assumed.
            for x in splited_list:
                trimmed_list.append(self.utf8(x))
        return trimmed_list

    def make_url(self):
        """Return one crawl URL per search term (base URL chosen by platform)."""
        urls = list()
        for x in self.split_searches():
            url = self.urls[self.platform()] + x
            urls.append(url)
        return urls

    def get_begin_day(self):
        """Earliest date to crawl: today shifted by before_day in realtime
        mode, otherwise the configured start day."""
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            result += datetime.timedelta(days=self.before_day)
            return result.date()
        else:
            return self.start_day()

    def get_end_day(self):
        """Latest date to crawl: today in realtime mode, otherwise the
        configured end day."""
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            return result.date()
        else:
            return self.end_day()
class ListTag:
    """Crawl the post list for one Instagram tag page.

    Fetches the tag HTML once, then pages onward via the /query/ ajax
    endpoint using the end_cursor returned by each response.
    """

    def __init__(self, url):
        self.__r = None            # last requests.Response
        self.__tag = ''            # tag name extracted from the URL
        self.__url = ''            # page URL, reused as ajax Referer
        self.list_tag = []         # most recently parsed list of posts
        self.end_cursor = None     # pagination cursor for the next ajax call
        self.has_next = False      # whether another ajax page exists
        self.cookies = {}          # accumulated cookies across requests
        self.load_url(url)

    def load_url(self, url):
        """GET the tag page and parse the initial post list from its HTML."""
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        if is_debuging:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('headers = ', end=' ')
            printl(instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__tag = self.__get_tag(url)
        self.__set_cookies(self.__r.cookies)
        self.__url = url
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_tag_html(self.__r.content)
        if is_debuging:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListTag End>")
        return self.list_tag

    def load_more(self):
        """POST the ajax query for the next page and update cursor state."""
        form_data = instaheaders.get_form_data_for_list_tag(self.__tag, self.end_cursor, num_of_list_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        if is_debuging:
            printl("<ListTag Start>")
            printl("<ListTag requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        # cookies are captured before raise_for_status so they survive a 4xx/5xx
        self.__set_cookies(self.__r.cookies)
        self.__r.raise_for_status()
        self.list_tag, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        if is_debuging:
            printl("<ListTag response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListTag End>")
        return self.list_tag

    def __get_tag(self, url):
        """Extract the tag name from a .../explore/tags/<tag> URL."""
        m = re.search(insta_tag_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        # allows a caller to resume paging from a saved cursor
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        # merge new cookies into the accumulated jar
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_tag
class ListUser:
    """User listing client: loads a user's page over HTTP and pages
    through further posts via Instagram's AJAX query endpoint.

    NOTE(review): depends on module-level names defined elsewhere in this
    file (instaheaders, instaparser, insta_query, num_of_list_ajax,
    is_debuging, printl).
    """
    def __init__(self, url):
        self.__r = None          # last requests.Response
        self.__user = ''         # user id parsed from the listing HTML
        self.__url = ''          # listing URL, reused as the AJAX referer
        self.list_user = []      # parsed entries from the latest load
        self.end_cursor = None   # pagination cursor for the next AJAX page
        self.has_next = False    # True while more pages are available
        self.cookies = {}        # cookies accumulated across responses
        self.load_url(url)

    def load_url(self, url):
        """GET the user listing HTML page and parse it.

        Returns the parsed entry list; also captures the user id needed
        for subsequent AJAX form data. Raises requests.HTTPError on a
        non-2xx response.
        """
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_list_html())
        self.__r.raise_for_status()
        self.__url = url
        self.__set_cookies(self.__r.cookies)
        self.list_user, self.end_cursor, self.has_next, self.__user = instaparser.parse_list_user_html(self.__r.content)
        return self.list_user

    def load_more(self):
        """POST the AJAX query for the next page, driven by end_cursor.

        Returns the newly parsed entry list and advances the cursor.
        """
        form_data = instaheaders.get_form_data_for_list_user(self.__user, self.end_cursor, num_of_list_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__url, form_data)
        if is_debuging:
            printl("<ListUser Start>")
            printl("<ListUser requests>")
            printl('end_cursor = ' + str(self.end_cursor))
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        if is_debuging:
            # NOTE(review): this debug block runs before the response is
            # parsed, so it shows the cursor/has_next of the previous page.
            printl("<ListUser response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('end_cursor = ' + str(self.end_cursor))
            printl('has_next = ', end='')
            printl(self.has_next)
            printl("<ListUser End>")
        self.list_user, self.end_cursor, self.has_next = instaparser.parse_list_ajax(self.__r.content)
        return self.list_user

    def get_cookies(self):
        return self.cookies

    def get_url(self):
        return self.__url

    def set_end_cursor(self, cursor):
        self.end_cursor = cursor

    def get_end_cursor(self):
        return self.end_cursor

    def __set_cookies(self, cookies):
        # Merge new response cookies into the accumulated jar.
        for k, v in cookies.items():
            self.cookies[k] = v

    def get_list(self):
        return self.list_user
class InstaContent:
    """Single post client: fetches an Instagram post page (body plus
    comments) and pages backwards through older comments via the AJAX
    query endpoint.

    NOTE(review): depends on module-level names defined elsewhere in this
    file (instaheaders, instaparser, insta_body_url, insta_query,
    num_of_reply_ajax, is_debuging, printl).
    """
    def __init__(self, url, cookies, referer):
        self.__r = None             # last requests.Response
        self.__referer = ''         # referer URL for AJAX requests
        self.__code = ''            # post shortcode extracted from the URL
        self.body = None            # parsed post body
        self.reply = []             # parsed comments from the latest load
        self.start_cursor = None    # cursor for loading older comments
        self.has_previous = False   # True while older comments remain
        self.cookies = {}           # cookies accumulated across responses
        self.load_url(url, cookies, referer)

    def load_url(self, url, cookies, referer):
        """GET the post HTML page and parse body plus first comment page.

        Returns (body, reply). Raises requests.HTTPError on a non-2xx
        response.
        """
        self.__set_cookies(cookies)
        self.__r = requests.get(url, headers=instaheaders.get_headers_for_body_html(self.cookies))
        self.__r.raise_for_status()
        self.__referer = referer
        self.__code = self.__get_code(url)
        self.body, self.reply, self.start_cursor, self.has_previous = instaparser.parse_body_html(self.__r.content)
        self.__set_cookies(self.__r.cookies)
        return self.body, self.reply

    def get_body(self):
        return self.body

    def get_reply(self):
        return self.reply

    def load_reply_more(self):
        """POST the AJAX query for the previous (older) comment page.

        Returns the newly parsed comment list and moves start_cursor back.
        """
        form_data = instaheaders.get_form_data_for_reply(self.__code, self.start_cursor, num_of_reply_ajax)
        headers = instaheaders.get_headers_for_ajax(self.cookies, self.__referer, form_data)
        if is_debuging:
            printl("<ContentReply Start>")
            printl("<ContentReply requests>")
            # BUGFIX: wrap in str() -- start_cursor can be None and
            # 'str' + None raised TypeError here; every other debug
            # block in this file already uses str().
            printl('start_cursor = ' + str(self.start_cursor))
            printl('form_data' + form_data)
            printl('headers = ', end=' ')
            printl(headers)
        self.__r = requests.post(insta_query, headers=headers, data=form_data)
        self.__r.raise_for_status()
        self.__set_cookies(self.__r.cookies)
        self.reply, self.start_cursor, self.has_previous = instaparser.parse_reply_ajax(self.__r.content)
        if is_debuging:
            printl("<ContentReply response>")
            printl('self.__r.cookies=', end='')
            printl(self.__r.cookies)
            printl('start_cursor = ' + str(self.start_cursor))
            printl('has_previous = ', end='')
            printl(self.has_previous)
            printl("<ContentReply End>")
        return self.reply

    def get_cookies(self):
        return self.cookies

    def __get_code(self, url):
        # Extract the post shortcode immediately following insta_body_url.
        m = re.search(insta_body_url + "([^/]*)", url)
        if m:
            return m.group(1)
        else:
            raise RuntimeError('Tag Error')

    def __set_cookies(self, cookies):
        # Merge new response cookies into the accumulated jar.
        for k, v in cookies.items():
            self.cookies[k] = v
class InstaAlgorithm:
    """Base crawl strategy.

    Subclasses implement crawl() with the listing/paging loop; this base
    class filters listed posts by the configured date window, queues them
    in list_crawl, then fetches each post's body and comments and sends
    everything to the database.
    """
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        # send_to_db: DB writer; crawl_init: InstaInit keyword configuration
        self.send_to_db = send_to_db
        self.crawl_init = crawl_init
        self.browser = browser
        self.driver = driver
        self.keyword_id = keyword_id
        self.reload_wait_second = reload_wait_second
        self.num_of_load_content = num_of_load_content
        self.page_down = page_down
        # listing entries queued for body crawling (filled by crawl_contents)
        self.list_crawl = []

    def crawl_content(self, url, cookies, referer):
        """Fetch one post's body and ALL of its comments, then persist them.

        The post row is delete-then-insert keyed by article_url; comments
        get the post's article_url/platform_id plus their list order.
        """
        content = InstaContent(url, cookies, referer)
        body = content.get_body()
        replies = content.get_reply()
        body['article_url'] = url
        body['keyword_id'] = self.keyword_id
        # printl(body['article_url'])
        while content.has_previous:
            # older comment pages are prepended to keep chronological order
            replies = content.load_reply_more() + replies
            wait(reply_wait_sec)
        for j in range(0, len(replies)):
            replies[j]['article_url'] = body['article_url']
            replies[j]['platform_id'] = body['platform_id']
            replies[j]['article_order'] = j
        self.send_to_db.delete_url(body['article_url'])
        self.send_to_db.send_body(body)
        if replies:
            self.send_to_db.send_reply(replies)
        printl('ok')
        printl()

    def start_crawl(self):
        """Run the subclass crawl loop, then release resources."""
        self.crawl()
        self.close()

    def close(self):
        # keep the browser open in debug mode for inspection
        if self.driver and not is_debuging:
            self.driver.quit()
        self.send_to_db.close()
        printl("Finished Crawling :)")

    def crawl(self):
        # subclasses must provide the listing/paging loop
        raise NotImplementedError

    def is_until_page(self):
        """True once the configured page limit of queued entries is reached."""
        if self.crawl_init.until_page and self.crawl_init.until_page <= len(self.list_crawl):
            return True
        else:
            return False

    def crawl_contents(self, contents_list, backup_set):
        """
        :param contents_list:
        :param backup_set:
        :return: is_load_more

        Queue entries inside the [begin_day, end_day] window; entries
        newer than the window are skipped, entries older than it are
        counted. Returns False (stop paging) once more than 6 too-old
        entries were seen or the page limit is hit, True otherwise.
        """
        old_elements = 0
        for element in contents_list:
            if element['date'].date() > self.crawl_init.get_end_day():
                # newer than the window: skip, just log the timestamp
                # printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
            elif element['date'].date() < self.crawl_init.get_begin_day():
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                old_elements += 1
                # listings are roughly newest-first, so a run of too-old
                # entries means the rest of the feed is out of range
                if old_elements > 6:
                    return False
            else:
                if not element['url'] in backup_set:
                    # printl(element['url'])
                    # printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                    # wait(1.5)
                    # self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
                    self.list_crawl.append(element)
                    backup_set.add(element['url'])
            if self.is_until_page():
                return False
        if self.list_crawl:
            printl("Number of Lists = {0}".format(len(self.list_crawl)))
        return True

    def crawl_list(self):
        """Crawl the body/comments of every queued listing entry.

        Per-entry failures are logged and skipped so one bad post does
        not abort the batch.
        """
        if self.list_crawl:
            printl()
            printl("Start Days = {0}".format(self.list_crawl[0]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("End Days = {0}".format(self.list_crawl[-1]['date'].strftime("%Y-%m-%d %H:%M:%S")))
            printl("Total gathered contents = {0}".format(len(self.list_crawl)))
            printl()
        for element in self.list_crawl:
            try:
                printl(element['url'])
                printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
                wait(body_wait_sec)
                self.crawl_content(element['url'], {}, element['url'])
            except Exception as e:
                printl(e)
                logging.info(e)
class InstaAlgorithmNormal(InstaAlgorithm):
    """Request-only crawl strategy.

    Pages Instagram listings purely over HTTP via ListTag/ListUser,
    without driving a browser. Any webdriver passed in is shut down
    immediately since it is unused.
    """
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)
        # this strategy never uses the browser; release it right away
        if self.driver:
            self.driver.quit()

    def crawl(self):
        """Crawl every configured listing URL; loops forever in realtime mode.

        On failure the current end_cursor is saved so the next attempt of
        the same URL resumes paging instead of restarting from the top.
        """
        real_time = True
        while real_time:
            printl("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None   # cursor carried over after a failure, to resume paging
            backup_set = set()  # URLs already queued, to avoid duplicates
            while i < len(url_list):
                list_crawler = None
                # first connect
                try:
                    printl(url_list[i] + "\n")
                    if insta_tag_url in url_list[i]:
                        list_crawler = ListTag(url_list[i])
                    else:
                        list_crawler = ListUser(url_list[i])
                    wait(1)
                    insta_list = list_crawler.get_list()
                    is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    # ajax load
                    while is_load_more:
                        if end_cursor:
                            # resume from the cursor saved by a previous failure
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                    self.crawl_list()
                    self.list_crawl.clear()
                    i += 1
                except Exception as e:
                    logging.info(e)
                    # BUGFIX: the handler itself used to crash -- accessing
                    # list_crawler when it was never bound (failure during
                    # construction) raised UnboundLocalError, and
                    # 'end_cursor=' + None raised TypeError.
                    if list_crawler is not None:
                        end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")
class InstaAlgorithmBrowser(InstaAlgorithm):
    """Browser-assisted crawl strategy.

    Loads the listing page in a real webdriver (to obtain a valid session
    and let it trigger 'load more'), then keeps paging via the HTTP AJAX
    endpoint using the browser's cookies.
    """
    def __init__(self, driver, browser, crawl_init, send_to_db, keyword_id,
                 reload_wait_second=2, num_of_load_content=12, page_down=50):
        super().__init__(driver, browser, crawl_init, send_to_db, keyword_id,
                         reload_wait_second, num_of_load_content, page_down)

    def url_load(self, url):
        """Create the matching listing client and parse the page the
        browser currently shows.

        Returns (list_crawler, insta_list, end_cursor, has_next).
        """
        if insta_tag_url in url:
            list_tag = ListTag(url)
            insta_list, end_cursor, has_next = instaparser.parse_list_tag_html(self.driver.page_source)
            return list_tag, insta_list, end_cursor, has_next
        else:
            list_user = ListUser(url)
            insta_list, end_cursor, has_next, user_id = instaparser.parse_list_user_html(self.driver.page_source)
            return list_user, insta_list, end_cursor, has_next

    def crawl(self):
        """Crawl every configured listing URL; loops forever in realtime mode.

        Periodically pokes the browser (load-more click / page-down) every
        self.page_down AJAX rounds, and restarts the browser after a
        failure before retrying the same URL.
        """
        real_time = True
        while real_time:
            url_list = self.crawl_init.make_url()
            i = 0
            end_cursor = None   # cursor carried over after a failure, to resume paging
            backup_set = set()  # URLs already queued, to avoid duplicates
            while i < len(url_list):
                list_crawler = None
                # first connect
                try:
                    wait(3)
                    printl(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    list_crawler, insta_list, end_cursor2, has_next = self.url_load(url_list[i])
                    is_load_more = self.crawl_contents(insta_list, backup_set) and has_next
                    list_crawler.set_end_cursor(end_cursor2)
                    # reuse the browser session cookies for the AJAX calls
                    list_crawler.cookies = {c['name']: c['value'] for c in self.driver.get_cookies()}
                    # ajax load
                    page_down = 0
                    while is_load_more:
                        if page_down == self.page_down:
                            page_down = 0
                            try:
                                focus_driver(self.driver)
                                click_insta_load_more(self.driver)
                            except:
                                # no load-more button; scroll instead
                                push_page_down(self.driver)
                        page_down += 1
                        if end_cursor:
                            # resume from the cursor saved by a previous failure
                            list_crawler.end_cursor = end_cursor
                            end_cursor = None
                        wait(self.reload_wait_second)
                        insta_list = list_crawler.load_more()
                        # printl("list length = " + str(len(insta_list)))
                        is_load_more = self.crawl_contents(insta_list, backup_set) and list_crawler.has_next
                        # printl("number of backup_set = {0}".format(len(backup_set)))
                    i += 1
                    self.crawl_list()
                    self.list_crawl.clear()
                except Exception as e:
                    logging.info(e)
                    # BUGFIX: the handler itself used to crash -- accessing
                    # list_crawler when it was never bound (failure before
                    # url_load) raised UnboundLocalError, and
                    # 'end_cursor=' + None raised TypeError.
                    if list_crawler is not None:
                        end_cursor = list_crawler.end_cursor
                    printl('end_cursor=' + str(end_cursor))
                    if e.args:
                        wait(300)
                    # restart the browser before retrying this URL
                    if self.driver:
                        self.driver.close()
                        wait(3)
                        self.driver = self.browser.new_browser()
            real_time = self.crawl_init.is_realtime()
        printl("Finished Crawling :)")
class InstaMainCrawler:
    """Top-level entry point for the Instagram crawler.

    Wires together the DB writer, the keyword configuration, and the
    browser, then picks and runs the appropriate crawl algorithm.
    """
    def __init__(self):
        self.send_to_db = SendtoDB()
        self.crawl_init = InstaInit()
        self.browser = Browser()
        self.driver = None

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def crawl_all(self, backup_set=None):
        # placeholder kept for interface parity with the other crawlers
        pass

    def start(self):
        self.crawler_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Apply all launch arguments; keyword must be loaded first since
        the other settings refine its crawl window."""
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)
        self.init_browser(browser)

    def set_driver(self, driver):
        self.driver = driver

    def init_browser(self, browser):
        # Browser startup failure is non-fatal: with no driver the
        # request-only (Normal) algorithm is used instead.
        try:
            self.set_driver(self.browser.get_new_driver(browser))
        except Exception as e:
            logging.info(e)

    def init_keyword_id(self, keyword_id):
        """Load keyword parameters from the DB for the given keyword id."""
        # int() is a no-op on ints, so this covers both str and int input
        # (replaces the old `type(keyword_id) != int` branch).
        self.keyword_id = int(keyword_id)
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def crawler_start(self):
        """Run the browser-based algorithm when a driver is available,
        otherwise fall back to the request-only algorithm."""
        if self.driver:
            algorithm = InstaAlgorithmBrowser(self.driver, self.browser, self.crawl_init, self.send_to_db,
                                              self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        else:
            algorithm = InstaAlgorithmNormal(self.driver, self.browser, self.crawl_init, self.send_to_db,
                                             self.keyword_id, list_wait_sec, num_of_list_ajax, num_of_page_down)
        algorithm.start_crawl()

View File

@@ -8,14 +8,20 @@ from selenium.webdriver.common.keys import Keys
def pageup_and_pagedown(_driver):
body = _driver.find_element_by_tag_name('body')
for i in range(0, 2):
body.send_keys(Keys.PAGE_UP)
wait(0.2)
for i in range(0, 5):
body.send_keys(Keys.PAGE_DOWN)
wait(0.2)
# body = _driver.find_element_by_tag_name('body')
# for i in range(0, 2):
# body.send_keys(Keys.PAGE_UP)
# wait(3)
# for i in range(0, 5):
# body.send_keys(Keys.PAGE_DOWN)
# wait(3)
for i in range(0, 3):
_driver.execute_script("window.scrollBy(0, -300)")
wait(0.4)
for i in range(0, 5):
_driver.execute_script("window.scrollBy(0, 800)")
wait(0.4)
def first_load(_driver):
element = _driver.find_element_by_css_selector("div._pupj3 > a")
@@ -39,7 +45,7 @@ def remove_myci9(_driver):
browser = Browser()
driver = browser.get_new_driver('chrome')
driver = browser.get_new_driver('ie')
url_sets = set()
wait(5)
@@ -53,9 +59,9 @@ wait(5)
first_load(driver)
wait(3)
print(driver.get_cookies())
#print(driver.get_cookies())
with open("c:\\data\\instajumma.txt", 'w') as f:
with open("c:\\data\\instajummaie.txt", 'w') as f:
try:
while True:
for j in range(0, 10):