git-svn-id: svn://192.168.0.12/source@348 8346c931-da38-4b9b-9d4c-e48b93cbd075

This commit is contained in:
admin
2017-05-30 03:32:11 +00:00
parent dad0365a79
commit b873412ef2
9 changed files with 616 additions and 30 deletions

View File

@@ -1,5 +1,11 @@
[database] [#database]
user=root user=root
pass=1234 pass=1234
host=192.168.0.82 host=192.168.0.82
name=bigbird name=bigbird
[database]
user=admin
pass=con2214lac!
host=182.162.171.147
name=bigbird

View File

@@ -0,0 +1,86 @@
class InstaUrlValidator:
    """Validate an Instagram post URL and rebuild it in canonical form.

    Accepts input shaped like ``https://www.instagram.com/p/<key>/`` (the
    scheme and the trailing slash are optional).  ``get_insta_url`` returns
    the normalized ``https://www.instagram.com/p/<key>/`` string, raising
    TypeError for non-string input and ValueError for malformed URLs.
    """

    # Hosts accepted as Instagram.  The original used a substring test
    # (`input_host in self.host`), which also accepted junk such as "w" or
    # "gram"; an exact whitelist match is used instead.
    _ALLOWED_HOSTS = ('www.instagram.com', 'instagram.com')

    def __init__(self, input_url):
        self.protocol = 'https'
        self.host = 'www.instagram.com'
        self.path1 = 'p'
        self.input_user_key = ''
        self.input_url = input_url

    def preprocess_input_url(self):
        """Strip surrounding whitespace; reject non-string input."""
        if not isinstance(self.input_url, str):
            raise TypeError('input url error')
        self.preprocessed_input_url = self.input_url.strip()

    def check_protocol(self):
        """Return the index just past ``scheme://``; 0 when no scheme given."""
        end_index = self.preprocessed_input_url.find(':')
        if end_index == -1:
            # No scheme at all (e.g. "www.instagram.com/p/x") -- allowed.
            return 0
        # Slice comparison avoids the IndexError the original raised when the
        # URL ended right after the colon (e.g. "https:").
        if self.preprocessed_input_url[end_index + 1:end_index + 3] != '//':
            raise ValueError('incorrect url format')
        return end_index + 3

    def check_host(self, start_index):
        """Validate the host segment; return the index just past its '/'."""
        end_index = self.preprocessed_input_url.find('/', start_index)
        if end_index == -1:
            raise ValueError('incorrect url format')
        input_host = self.preprocessed_input_url[start_index:end_index]
        if input_host not in self._ALLOWED_HOSTS:
            raise ValueError('incorrect host')
        return end_index + 1

    def check_path1(self, start_index):
        """Require the first path segment to be 'p'; return index past it."""
        end_index = self.preprocessed_input_url.find('/', start_index)
        if end_index == -1:
            raise ValueError('incorrect path')
        input_path1 = self.preprocessed_input_url[start_index:end_index]
        if input_path1 != self.path1:
            raise ValueError('incorrect path (/p/)')
        return end_index + 1

    def check_path2(self, start_index):
        """Extract the post key (second path segment, trailing '/' optional)."""
        end_index = self.preprocessed_input_url.find('/', start_index)
        if end_index != -1:
            self.input_user_key = self.preprocessed_input_url[start_index:end_index]
        else:
            self.input_user_key = self.preprocessed_input_url[start_index:]

    def make_instagram_url(self):
        """Assemble the canonical URL from the extracted post key."""
        if len(self.input_user_key) <= 0:
            raise ValueError('incorrect user key')
        return self.protocol + '://' + self.host + '/' + self.path1 + '/' + self.input_user_key + '/'

    def validate_url(self):
        """Run the full validation pipeline; exceptions propagate to the caller."""
        self.preprocess_input_url()
        start_index = self.check_protocol()
        start_index = self.check_host(start_index)
        start_index = self.check_path1(start_index)
        self.check_path2(start_index)

    def get_insta_url(self):
        """Validate the input URL and return it in canonical form."""
        self.validate_url()
        return self.make_instagram_url()

View File

@@ -36,6 +36,20 @@ insta_tag_url = "https://www.instagram.com/explore/tags/"
insta_query = "https://www.instagram.com/query/" insta_query = "https://www.instagram.com/query/"
insta_body_url = 'https://www.instagram.com/p/' insta_body_url = 'https://www.instagram.com/p/'
# Field indices for buzz records.  DATE/DAY/ACC index into a *base* record
# ({date, day, acc}); REPLY_DAY..LIKE_ACC index into a *summary* record.
DATE = 0
REPLY_DAY = 1
REPLY_ACC = 2
LIKE_DAY = 3
LIKE_ACC = 4
DAY = 5
ACC = 6
# NOTE(review): BUZZ_KEY has only 7 entries, so BUZZ_KEY[REPLY] /
# BUZZ_KEY[LIKE] (indices 7 and 8) would raise IndexError.  REPLY and LIKE
# appear to be used only as type tags (e.g. get_buzzs(buzzs, LIKE)), never
# as indices into BUZZ_KEY -- confirm before using them that way.
REPLY = 7
LIKE = 8
BUZZ_KEY = [
"date", "reply_day", "reply_acc", "like_day", "like_acc", "day", "acc"
]
def requests_get(req, timeout=requests_timeout): def requests_get(req, timeout=requests_timeout):
body = [] body = []
@@ -136,10 +150,25 @@ class InstaContent:
class EffectInsta(object): class EffectInsta(object):
def __init__(self, event_num, event_code, url):
def __init__(self, event_num, event_code, url, start_date):
self.event_num = event_num self.event_num = event_num
self.event_code = event_code self.event_code = event_code
self.url = url self.url = url
self.start_date = start_date.replace("-", "")
self.database = self.database_init()
def database_init(self):
try:
cg = get_settings()
except Exception as e:
raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')
database = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
database.connect()
return database
def start(self): def start(self):
#content = insta.instacrawl.InstaContent(self.url, {}, self.url) #content = insta.instacrawl.InstaContent(self.url, {}, self.url)
@@ -187,8 +216,12 @@ class EffectInsta(object):
result['replycount'] = int(body.get('article_order'), 0) result['replycount'] = int(body.get('article_order'), 0)
result['likecount'] = int(body.get('reply_url'), 0) result['likecount'] = int(body.get('reply_url'), 0)
result['interactioncount'] = self.get_replycount(body, replies) result['interactioncount'] = self.get_replycount(body, replies)
result['replybuzz'] = self.get_reply_buzz(body, replies) replybuzz = self.get_reply_buzz(body, replies)
likebuzzs = self.get_like_buzz(int(body.get('reply_url'), 0))
totalbuzz = self.summary_reply_and_like(replybuzz, likebuzzs)
result['replybuzz'] = json.dumps(totalbuzz, sort_keys=True)
result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0) result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
return result return result
def get_replycount(self, body, replies): def get_replycount(self, body, replies):
@@ -197,22 +230,171 @@ class EffectInsta(object):
set_reply_id.add(i.get('article_id', '')) set_reply_id.add(i.get('article_id', ''))
return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id) return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)
# def get_reply_buzz(self, body, replies):
# start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
# end_date = datetime.datetime.now().date()
# date_dict = dict()
# while start_date <= end_date:
# date_dict[start_date.strftime('%Y%m%d')] = 0
# start_date = start_date + datetime.timedelta(days=1)
#
# for reply in replies:
# str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
# reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
# print(reply_date)
# if reply_date in date_dict:
# date_dict[reply_date] = date_dict[reply_date] + 1
#
# print(date_dict)
#
# json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
#
# return json.dumps(json_array, sort_keys=True)
def get_reply_buzz(self, body, replies): def get_reply_buzz(self, body, replies):
start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date() start_date = datetime.datetime.strptime(self.start_date, '%Y%m%d').date()
end_date = datetime.datetime.now().date() today = datetime.datetime.now().date()
date_dict = dict() date_dict = dict()
while start_date <= end_date: while start_date <= today:
date_dict[start_date.strftime('%Y%m%d')] = 0 date_dict[start_date.strftime('%Y%m%d')] = 0
start_date = start_date + datetime.timedelta(days=1) start_date = start_date + datetime.timedelta(days=1)
for reply in replies: for reply in replies:
str_reply_date = reply.get('article_date', '1990-01-01 00:00:00') str_reply_date = reply.get('article_date')
reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%m-%d-%Y') reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
if reply_date in date_dict: if reply_date in date_dict:
date_dict[reply_date] = date_dict[reply_date] + 1 date_dict[reply_date] = date_dict[reply_date] + 1
json_array = [{'date': k, 'value': v} for k, v in date_dict.items()] reply_buzzs = self.make_dummy_buzzs(self.start_date, datetime.datetime.today().strftime('%Y%m%d'))
reply_acc_count = 0
for reply_buzz in reply_buzzs:
date = reply_buzz[BUZZ_KEY[DATE]]
reply_count = date_dict[date]
reply_acc_count += reply_count
reply_buzz[BUZZ_KEY[DAY]] = date_dict[date]
reply_buzz[BUZZ_KEY[ACC]] = reply_acc_count
return json.dumps(json_array, sort_keys=True) # json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
# return json.dumps(json_array, sort_keys=True)
return reply_buzzs
def get_like_buzz(self, like_count):
    """Build the per-day like-buzz series from the event start through today.

    Loads the previously stored buzz JSON for this event from the database,
    projects out the like counts, fills them into a zero-filled series
    covering [self.start_date .. today], and sets today's entry from
    *like_count* (the accumulated like total scraped just now).

    Any failure (DB access, JSON decode, ...) is re-raised as
    effect.effecterror.DBQueryError.
    """
    # The original also parsed self.start_date into an unused local here.
    today = datetime.datetime.today().strftime('%Y%m%d')
    try:
        stored = self.database.get_buzz(self.event_num)
        # get_buzz returns the raw JSON string, or None when no row exists.
        buzzs = json.loads(stored) if stored is not None else []
        buzzs = self.get_buzzs(buzzs, LIKE)
        like_dummy_buzzs = self.make_dummy_buzzs(self.start_date, today)
        like_buzzs = self.fill_buzzs_into_dummy(buzzs, like_dummy_buzzs)
        like_buzzs = self.put_today_buzz(like_buzzs, like_count)
    except Exception as e:
        raise effect.effecterror.DBQueryError(str(e))
    return like_buzzs
def make_base_buzz_instance(self, values):
    """Build one base buzz record from a [date, day_count, acc_count] triple."""
    return {
        BUZZ_KEY[DATE]: values[0],
        BUZZ_KEY[DAY]: values[1],
        BUZZ_KEY[ACC]: values[2],
    }
def make_summary_buzz_instance(self, values):
    """Build one summary record; values = [date, reply_record, like_record]."""
    date, reply_record, like_record = values[0], values[1], values[2]
    return {
        BUZZ_KEY[DATE]: date,
        BUZZ_KEY[REPLY_DAY]: reply_record[BUZZ_KEY[DAY]],
        BUZZ_KEY[REPLY_ACC]: reply_record[BUZZ_KEY[ACC]],
        BUZZ_KEY[LIKE_DAY]: like_record[BUZZ_KEY[DAY]],
        BUZZ_KEY[LIKE_ACC]: like_record[BUZZ_KEY[ACC]],
    }
def make_dummy_buzzs(self, start_date, end_date):
    """Return zero-filled daily records for every day in [start_date, end_date].

    Both bounds are 'YYYYMMDD' strings and are inclusive; an empty list is
    returned when start_date is after end_date.
    """
    first = datetime.datetime.strptime(start_date, '%Y%m%d')
    last = datetime.datetime.strptime(end_date, '%Y%m%d')
    span_days = (last - first).days
    return [
        self.make_base_buzz_instance([
            (first + datetime.timedelta(days=offset)).strftime('%Y%m%d'), 0, 0
        ])
        for offset in range(span_days + 1)
    ]
def put_today_buzz(self, buzzs, today_acc_buzz_count):
    """Fill the last (today's) record of *buzzs* from the running total.

    *today_acc_buzz_count* is the accumulated count scraped right now; the
    daily value is its difference against yesterday's accumulated value
    (buzzs[-2]), clamped at 0 so a shrinking total never produces a
    negative day count.  The accumulated value is NOT clamped, matching
    the original behavior (it always ends up equal to the scraped total).

    NOTE(review): requires len(buzzs) >= 2, and list.copy() is shallow --
    the record dicts are shared with the input list, so the caller's last
    record is mutated too.  Confirm both are intended.
    """
    # The original also built an unused 'today' date string here.
    today_buzz_count = today_acc_buzz_count - buzzs[-2][BUZZ_KEY[ACC]]
    result_buzzs = buzzs.copy()
    result_buzzs[-1][BUZZ_KEY[DAY]] = max(today_buzz_count, 0)
    result_buzzs[-1][BUZZ_KEY[ACC]] = result_buzzs[-2][BUZZ_KEY[ACC]] + today_buzz_count
    return result_buzzs
def fill_buzzs_into_dummy(self, buzzs, dummy):
    """Overlay real buzz records onto a zero-filled dummy series.

    The first len(buzzs) dummy records are overwritten positionally with
    the real date/day/acc values; afterwards, any remaining zero 'acc'
    value is forward-filled from the previous day's accumulated value.

    NOTE(review): the overlay is positional (zip), not matched by date,
    and list.copy() is shallow -- the caller's dummy dicts are mutated.
    """
    dummy_clone = dummy.copy()
    for dummy_buzz, real_buzz in zip(dummy_clone, buzzs):
        dummy_buzz[BUZZ_KEY[DATE]] = real_buzz[BUZZ_KEY[DATE]]
        dummy_buzz[BUZZ_KEY[DAY]] = real_buzz[BUZZ_KEY[DAY]]
        dummy_buzz[BUZZ_KEY[ACC]] = real_buzz[BUZZ_KEY[ACC]]
    # Forward-fill: start at index 1 so the previous-index guard is implicit.
    # (The original indexed dummy_clone[-1] first and only afterwards tested
    # previous_index >= 0; same result, but the wrap-around read is avoided.
    # It also made an unused copy of buzzs.)
    for index in range(1, len(dummy_clone)):
        previous_acc_value = dummy_clone[index - 1][BUZZ_KEY[ACC]]
        if previous_acc_value > 0 and dummy_clone[index][BUZZ_KEY[ACC]] == 0:
            dummy_clone[index][BUZZ_KEY[ACC]] = previous_acc_value
    return dummy_clone
def get_buzzs(self, buzzs, buzz_type):
    """Project stored summary records down to base {date, day, acc} records.

    Only the LIKE projection is implemented; any other buzz_type yields [].
    """
    if buzz_type != LIKE:
        return []
    return [
        self.make_base_buzz_instance([
            buzz[BUZZ_KEY[DATE]],
            buzz[BUZZ_KEY[LIKE_DAY]],
            buzz[BUZZ_KEY[LIKE_ACC]],
        ])
        for buzz in buzzs
    ]
def is_valid_data(self, reply_buzzs, like_buzzs):
    """Return True when both buzz series cover exactly the same dates.

    Delegates to get_date_list (defined elsewhere in this class) and
    compares the two date lists directly instead of the original
    if/else True/False ladder.
    """
    return self.get_date_list(reply_buzzs) == self.get_date_list(like_buzzs)
def summary_reply_and_like(self, reply_buzzs, like_buzzs):
    """Zip per-day reply and like records into combined summary records.

    Records are paired positionally; if the series differ in length the
    extra tail records are silently dropped (zip semantics).
    """
    return [
        self.make_summary_buzz_instance([
            reply_buzz[BUZZ_KEY[DATE]],
            reply_buzz,
            like_buzz,
        ])
        for reply_buzz, like_buzz in zip(reply_buzzs, like_buzzs)
    ]

View File

@@ -37,6 +37,17 @@ class ResultSender:
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" + " on duplicate key update " + \ return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" + " on duplicate key update " + \
','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list))) ','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))
def get_buzz(self, event_num):
    """Fetch the stored replybuzz JSON for *event_num*; None when no row.

    Reconnects first if the connection has been closed.  Uses a
    parameterized query -- the original concatenated str(event_num)
    directly into the SQL string.
    """
    query = 'select replybuzz from stats_s1_effect where event_num = %s'
    if not self.conn.open:
        self.connect()
    with self.conn.cursor() as cursor:
        cursor.execute(query, (event_num,))
        buzz = cursor.fetchone()
    # fetchone returns a dict row (or None); unwrap the single column.
    return buzz['replybuzz'] if buzz is not None else None
def send(self, table_name, dictionary): def send(self, table_name, dictionary):
query = self._make_query(table_name, dictionary) query = self._make_query(table_name, dictionary)
self._exec_query(query) self._exec_query(query)

View File

@@ -1,6 +1,7 @@
import effect.effectinstagram import effect.effectinstagram
import effect.effecterror import effect.effecterror
import effect.effectkakaostory import effect.effectkakaostory
from effect.InstaUrlValidator import InstaUrlValidator
from base.baseclasses import printl from base.baseclasses import printl
import sys import sys
import base.baseclasses import base.baseclasses
@@ -33,9 +34,17 @@ def get_browser_info(platform_, file_name="browser.txt"):
return options.get(platform_, options['default']) return options.get(platform_, options['default'])
def get_effect_process(platform_, event_num, url): def get_effect_process(platform_, event_num, url, start_date):
if platform_ == 'instagram': if platform_ == 'instagram':
return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url) try:
insta_url_validator = InstaUrlValidator(url)
insta_url = insta_url_validator.get_insta_url()
except Exception as e:
printl("x!@#!@#!@#e010!@#check url")
exit(1)
# return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url, start_date)
return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), insta_url, start_date)
else: else:
browser_info = get_browser_info(platform_) browser_info = get_browser_info(platform_)
@@ -52,14 +61,19 @@ if __name__ == '__main__':
sys.argv[1] instagram, kakaostory, facebook sys.argv[1] instagram, kakaostory, facebook
sys.argv[2] event_num sys.argv[2] event_num
sys.argv[3] url sys.argv[3] url
sys.argv[4] start date
""" """
if len(sys.argv) != 4: # if len(sys.argv) != 4:
# printl("x!@#!@#!@#e010!@#check argument")
# exit(1)
if len(sys.argv) != 5:
printl("x!@#!@#!@#e010!@#check argument") printl("x!@#!@#!@#e010!@#check argument")
exit(1) exit(1)
try: try:
effect_process = get_effect_process(sys.argv[1], sys.argv[2], sys.argv[3]) effect_process = get_effect_process(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
effect_process.start() effect_process.start()
except effect.effecterror.EffectException as e: except effect.effecterror.EffectException as e:
printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + str(e)) printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + str(e))

View File

@@ -0,0 +1,91 @@
import time
from selenium.common.exceptions import WebDriverException
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import find_elements_by_css_selector
from bs4 import BeautifulSoup
# Prefer the faster lxml parser for BeautifulSoup; fall back to the
# stdlib html.parser when lxml is not installed.
try:
    import lxml
    parser_opt = 'lxml'
except ImportError:
    parser_opt = 'html.parser'

# Maximum number of scroll-and-reload attempts before giving up on a list.
limit_reload = 5
# CSS selectors for the three Facebook list containers: hashtag search
# results, a page timeline, and a group feed.
list_tag_css_selector = "div#initial_browse_result"
list_page_css_selector = "div#pagelet_timeline_main_column"
list_group_css_selector = "div#pagelet_group_"
# Selector matching one post card inside a list container.
each_post_css_selector = "div._4-u2._4-u8"
# Timeout (seconds) passed to the find_element* helpers.
wait_second_for_find_element = 30
class ListBase(object):
    """Base helper for walking a Facebook post list inside a selenium page.

    Subclasses set ``list_css_selector`` to their container's selector;
    has_next / get_url / get_date / get_num_of_list are abstract.
    """

    def __init__(self, driver):
        self.driver = driver
        self.url_list = []             # post elements not yet consumed
        self.list_css_selector = None  # set by subclasses
        self.list_container_dom = None
        self.current_post = None       # element currently being processed

    def set_url_elements(self):
        """Collect post elements from the list container into url_list."""
        # NOTE(review): find_element_by_css_selector (singular) is extended
        # into a list below -- verify the helper returns a sequence,
        # otherwise this should call find_elements_by_css_selector.
        elements = find_element_by_css_selector(self.driver,
                                                self.list_css_selector + " " + each_post_css_selector,
                                                wait_second_for_find_element)
        self.url_list.extend(elements)

    def move_first(self):
        """Pop the first pending post into current_post (None when empty).

        Fixed: the original assigned the other way around
        (``self.url_list = self.current_post.pop(0)``), which raised
        AttributeError because current_post starts as None.
        """
        self.current_post = self.url_list.pop(0) if self.url_list else None

    def move_next(self):
        """Advance to the next post (same as move_first on the queue)."""
        self.move_first()

    def check_list_and_load(self):
        """Try to load posts until at least some exist; raise when none appear."""
        for _ in range(limit_reload):
            num_of_list = len(self.url_list)
            if num_of_list < 2:
                self.load_more_list()
                num_of_list = self.get_num_of_list()
            if not num_of_list:
                raise WebDriverException("There is no data or ajax error")

    def load_more_list(self):
        """Nudge the window and scroll to trigger Facebook's lazy loading."""
        position = self.driver.get_window_position()
        size = self.driver.get_window_size()
        # Maximize then restore to force a re-layout before scrolling.
        self.driver.maximize_window()
        self.driver.set_window_size(size['width'], size["height"])
        self.driver.set_window_position(position['x'], position['y'])
        for _ in range(2):
            self.driver.execute_script("window.scrollBy(0, -400)")
            time.sleep(0.3)
        for _ in range(4):
            self.driver.execute_script("window.scrollBy(0, 800)")
            time.sleep(0.3)

    def has_next(self):
        raise NotImplementedError

    def get_url(self):
        raise NotImplementedError

    def get_date(self):
        raise NotImplementedError

    def remove_current_post(self):
        """Delete the current post's DOM node so it is not collected again."""
        css_selector = "div#" + self.current_post.id
        self.driver.execute_script('document.querySelector("' + css_selector + '").remove()')

    def get_num_of_list(self):
        raise NotImplementedError
class ListTag(ListBase):
    """Post list for a hashtag search-results page."""

    def __init__(self, driver):
        # Inherit all list state, then scope to the tag-results container.
        super().__init__(driver)
        self.list_css_selector = list_tag_css_selector
class ListPage(ListBase):
    """Post list for a page timeline.

    Fixed: the original __init__ assigned self.driver directly and never
    called super().__init__, so url_list / current_post were never
    initialized and every inherited method touching them raised
    AttributeError (ListTag already did this correctly).
    """

    def __init__(self, driver):
        super().__init__(driver)
        self.list_css_selector = list_page_css_selector

View File

@@ -0,0 +1,197 @@
#-*- coding: utf-8 -*-
import logging
import re
import json
import datetime
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import find_elements_by_css_selector
from base.baseclasses import find_elements_by_xpath
from base.baseclasses import enter_element
from base.baseclasses import Browser
# Landing page used before redirecting to the Facebook login form.
facebook_url = "http://bigbird.iptime.org/fbtest.php"
# Base URL for hashtag crawls; the tag is appended.
facebook_tag_url = "https://www.facebook.com/hashtag/"
# SECURITY(review): real account credentials committed in plaintext.
# Move these to a config file or environment variables and rotate the
# password; they are also duplicated inside FacebookMainCrawler.facebook_login.
facebook_id = 'concepters22@gmail.com'
facebook_password = 'zjstpqxjtm'
class FacebookInit(CrawlInit):
    """Crawl configuration for Facebook.

    Platform code 11 is a hashtag crawl, 12 a page crawl; the base URL for
    each comes from the module-level constants.
    """

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # Base URL per platform code.
        self.urls = {11: facebook_tag_url, 12: facebook_url}

    def split_searches(self):
        """Split the comma-separated search string into cleaned terms.

        Page crawls (platform 12) only strip whitespace; hashtag crawls run
        each term through utf8().
        """
        terms = self.searches().split(',')
        if self.platform() == 12:
            return [term.strip() for term in terms]
        return [self.utf8(term) for term in terms]

    def make_url(self):
        """Build one crawl URL per search term; numeric terms are profile ids."""
        base = self.urls[self.platform()]
        urls = []
        for term in self.split_searches():
            path = 'profile.php?id=' + term if term.isnumeric() else term
            urls.append(base + path + "?fref=ts")
        return urls

    def get_begin_day(self):
        """Crawl-window start: midnight shifted by before_day for realtime
        runs, otherwise the configured start day."""
        if not self.is_realtime():
            return self.start_day()
        now = datetime.datetime.now()
        midnight = datetime.datetime(year=now.year, month=now.month, day=now.day)
        return midnight + datetime.timedelta(days=self.before_day)

    def get_end_day(self):
        """Crawl-window end: today's midnight for realtime runs, otherwise
        the configured end day."""
        if not self.is_realtime():
            return self.end_day()
        now = datetime.datetime.now()
        return datetime.datetime(year=now.year, month=now.month, day=now.day)

    def is_hashtag(self):
        """Everything except a page crawl (platform 12) is a hashtag crawl."""
        return self.platform() != 12
class FacebookMainCrawler:
    """Top-level driver: logs in to Facebook and crawls each search URL.

    Lifecycle: set_arguments(...) wires up the browser, keyword and crawl
    window, then start() loops over the URLs built by FacebookInit,
    restarting the browser whenever a URL fails.
    """

    def __init__(self):
        self.crawl_init = FacebookInit()
        self.browser = Browser()
        self.driver = None
        self.keyword_id = None
        self.url = None
        self.db_num = None

    def set_driver(self, driver):
        self.driver = driver

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def start(self):
        self.crawl_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Initialize browser, keyword, DB and crawl-window settings in one call."""
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        """Coerce the id to int and load its crawl parameters from the DB."""
        # isinstance instead of the original `type(keyword_id) != int`.
        self.keyword_id = keyword_id if isinstance(keyword_id, int) else int(keyword_id)
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.db_num = db_num

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def set_main_window_handler(self, window_handler):
        self.main_window_handler = window_handler

    def crawl_start(self):
        """Main loop: crawl every search URL; repeat while realtime is set.

        A failing URL is retried after quitting and re-creating the browser
        (the index is not advanced on failure).
        """
        real_time = True
        while real_time:
            print_and_flush("Crawler Start")
            url_list = self.crawl_init.make_url()
            i = 0
            backup_set = set()
            while i < len(url_list):
                try:
                    self.set_main_window_handler(self.driver.window_handles[0])
                    print_and_flush(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    self.facebook_login()
                    body = self.driver.find_element_by_tag_name('body')
                    self.click_element(body)
                    # NOTE(review): self.page_crawler is never assigned in this
                    # class -- this raises AttributeError unless a collaborator
                    # injects it; confirm where it is supposed to be set.
                    self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
                                               end_date=self.crawl_init.get_end_day())
                    self.crawl_all_current_url(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    # Any failure: log it, restart the browser, retry this URL.
                    logging.info(e)
                    self.driver.quit()
                    # NOTE(review): init_browser uses get_new_driver() but this
                    # path calls new_browser() -- confirm both helpers exist.
                    self.set_driver(self.browser.new_browser())
                    wait(5)
            real_time = self.crawl_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        self.driver.quit()

    def go_bigbird(self, driver):
        """Navigate the given driver to the landing page."""
        driver.get(facebook_url)

    def click_facebook_login(self, driver):
        """Follow the first link on the landing page toward the login form."""
        element_a = find_element_by_css_selector(driver, "a[href]", 15)
        enter_element(element_a)

    def login_facebook(self, driver, f_id, f_pw):
        """Fill and submit the login form with the supplied credentials."""
        element_email = find_element_by_css_selector(driver, "input#email", 15)
        element_password = find_element_by_css_selector(driver, "input#pass", 15)
        element_button = find_element_by_css_selector(driver, "button#loginbutton", 15)
        element_email.send_keys(f_id)
        element_password.send_keys(f_pw)
        enter_element(element_button)

    def facebook_login(self):
        """Log in with the module-level credentials if the form is present.

        A missing login form is treated as "already logged in" and is a
        no-op.  Fixed: narrowed the original bare ``except:`` and replaced
        a second hard-coded copy of the credentials with the module-level
        facebook_id / facebook_password constants.
        """
        try:
            element_email = find_element_by_css_selector(self.driver, '#email', 15)
            element_pwd = find_element_by_css_selector(self.driver, '#pass', 15)
        except Exception:
            # Login form not found -- assume the session is already active.
            return
        element_email.send_keys(facebook_id)
        element_pwd.send_keys(facebook_password)
        label = self.driver.find_element_by_css_selector('#loginbutton')
        element_input = label.find_element_by_xpath('input')
        element_input.send_keys(Keys.NULL)
        element_input.send_keys(Keys.ENTER)
        wait(5)

    def click_element(self, element):
        """Move the mouse to *element* and click it, then settle briefly."""
        ac = ActionChains(self.driver)
        ac.move_to_element(element).click().perform()
        wait(4)

View File

@@ -314,7 +314,6 @@ def crawl_content_process(qu, keyword_id, db_num):
break break
ok = True ok = True
while ok: while ok:
time.sleep(2)
try: try:
# get a instance of InstaContent by do_no_proxy func. # get a instance of InstaContent by do_no_proxy func.
# if element['url'] is invalid, content is None # if element['url'] is invalid, content is None

View File

@@ -103,10 +103,10 @@ def parse_body_html(content):
start_cursor = None start_cursor = None
has_previous = False has_previous = False
if postpage: if postpage:
media = postpage[0]["media"] media = postpage[0]["graphql"]["shortcode_media"]
body = { body = {
"article_date": (old_date + datetime.timedelta(seconds=media["date"])).strftime("%Y-%m-%d %H:%M:%S"), "article_date": (old_date + datetime.timedelta(seconds=media["taken_at_timestamp"])).strftime("%Y-%m-%d %H:%M:%S"),
"article_data": media["caption"], "article_data": media["edge_media_to_caption"]["edges"][0]["node"]["text"],
"article_id": media["owner"]["username"], "article_id": media["owner"]["username"],
"article_nickname": media["owner"]["username"], "article_nickname": media["owner"]["username"],
"platform_id": media["owner"]["username"], "platform_id": media["owner"]["username"],
@@ -115,22 +115,22 @@ def parse_body_html(content):
"platform_title": media["owner"]["username"], "platform_title": media["owner"]["username"],
"article_form": "body", "article_form": "body",
"article_profileurl": media["owner"]["profile_pic_url"], "article_profileurl": media["owner"]["profile_pic_url"],
"article_order": str(media["comments"]["count"]), "article_order": str(media["edge_media_to_comment"]["count"]),
"article_hit": str(media.get('video_views', 0)), "article_hit": str(0),
"reply_url": str(media["likes"]["count"]) "reply_url": str(media["edge_media_preview_like"]["count"])
} }
comments = postpage[0]["media"]["comments"] comments = postpage[0]["graphql"]["shortcode_media"]["edge_media_to_comment"]
has_previous = comments["page_info"]["has_previous_page"] has_previous = comments["page_info"]["has_next_page"]
start_cursor = comments["page_info"]["start_cursor"] start_cursor = comments["page_info"]["end_cursor"]
nodes = comments["nodes"] nodes = comments["edges"]
for node in nodes: for node in nodes:
reply.append({ reply.append({
"article_data": node["text"], "article_data": node["node"]["text"],
"article_date": "article_date":
(old_date + datetime.timedelta(seconds=node["created_at"])).strftime("%Y-%m-%d %H:%M:%S"), (old_date + datetime.timedelta(seconds=node["node"]["created_at"])).strftime("%Y-%m-%d %H:%M:%S"),
"article_id": node["user"]["username"], "article_id": node["node"]["owner"]["username"],
"article_nickname": node["user"]["username"], "article_nickname": node["node"]["owner"]["username"],
"article_profileurl": node["user"]["profile_pic_url"], "article_profileurl": node["node"]["owner"]["profile_pic_url"],
"platform_name": "instagram", "platform_name": "instagram",
"platform_form": "post", "platform_form": "post",
"article_form": "reply" "article_form": "reply"