git-svn-id: svn://192.168.0.12/source@348 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -1,5 +1,11 @@
|
||||
[database]
|
||||
[#database]
|
||||
user=root
|
||||
pass=1234
|
||||
host=192.168.0.82
|
||||
name=bigbird
|
||||
|
||||
[database]
|
||||
user=admin
|
||||
pass=con2214lac!
|
||||
host=182.162.171.147
|
||||
name=bigbird
|
||||
86
WebBasedCrawler/effect/InstaUrlValidator.py
Normal file
86
WebBasedCrawler/effect/InstaUrlValidator.py
Normal file
@@ -0,0 +1,86 @@
|
||||
class InstaUrlValidator:
    """Validate an Instagram post URL and normalize it to the canonical
    ``https://www.instagram.com/p/<key>/`` form.

    Usage: ``InstaUrlValidator(url).get_insta_url()``.
    Raises TypeError for non-string input and ValueError for any URL that
    does not match the expected ``[scheme://]host/p/<key>[/...]`` shape.
    """

    def __init__(self, input_url):
        # Expected components of a valid Instagram post URL.
        self.protocol = 'https'
        self.host = 'www.instagram.com'
        self.path1 = 'p'

        # Media shortcode extracted from the path; filled in by check_path2().
        self.input_user_key = ''
        self.input_url = input_url

    def preprocess_input_url(self):
        """Strip surrounding whitespace; reject non-string input."""
        if not isinstance(self.input_url, str):
            raise TypeError('input url error')

        self.preprocessed_input_url = self.input_url.strip()

    def check_protocol(self):
        """Return the index just past ``<scheme>://``, or 0 when no scheme is present.

        Raises ValueError when a ':' is present but not followed by '//'.
        (The original indexed the two characters after ':' individually,
        which raised an unhelpful IndexError on a truncated URL such as
        ``'https:'``; slicing never goes out of range.)
        """
        end_index = self.preprocessed_input_url.find(':')
        if end_index == -1:
            return 0

        if self.preprocessed_input_url[end_index + 1:end_index + 3] != '//':
            raise ValueError('incorrect url format')

        return end_index + 3

    def check_host(self, start_index):
        """Validate the host component; return the index just past its '/'."""
        end_index = self.preprocessed_input_url.find('/', start_index)
        if end_index == -1:
            raise ValueError('incorrect url format')

        input_host = self.preprocessed_input_url[start_index:end_index]
        # The original substring test (`input_host not in self.host`) accepted
        # any fragment of the host — including the empty string and e.g.
        # 'gram.com'. Accept only the canonical host or the same host without
        # the leading 'www.' prefix.
        if input_host not in (self.host, self.host.replace('www.', '', 1)):
            raise ValueError('incorrect host')

        return end_index + 1

    def check_path1(self, start_index):
        """Validate the first path segment ('p'); return the index past its '/'."""
        end_index = self.preprocessed_input_url.find('/', start_index)
        if end_index == -1:
            raise ValueError('incorrect path')

        input_path1 = self.preprocessed_input_url[start_index:end_index]
        if input_path1 != self.path1:
            raise ValueError('incorrect path (/p/)')

        return end_index + 1

    def check_path2(self, start_index):
        """Extract the media key (second path segment; trailing '/' optional)."""
        end_index = self.preprocessed_input_url.find('/', start_index)
        if end_index != -1:
            self.input_user_key = self.preprocessed_input_url[start_index:end_index]
        else:
            self.input_user_key = self.preprocessed_input_url[start_index:]

    def make_instagram_url(self):
        """Build the canonical URL from the validated parts.

        Raises ValueError when no media key was extracted.
        """
        if not self.input_user_key:
            raise ValueError('incorrect user key')

        return self.protocol + '://' + self.host + '/' + self.path1 + '/' + self.input_user_key + '/'

    def validate_url(self):
        """Run the full validation pipeline.

        Exceptions propagate to the caller unchanged — the original wrapped
        this in ``try/except Exception as e: raise e``, which is a no-op.
        """
        self.preprocess_input_url()
        start_index = self.check_protocol()
        start_index = self.check_host(start_index)
        start_index = self.check_path1(start_index)
        self.check_path2(start_index)

    def get_insta_url(self):
        """Validate ``self.input_url`` and return its canonical form."""
        self.validate_url()
        return self.make_instagram_url()
|
||||
@@ -36,6 +36,20 @@ insta_tag_url = "https://www.instagram.com/explore/tags/"
|
||||
insta_query = "https://www.instagram.com/query/"
|
||||
insta_body_url = 'https://www.instagram.com/p/'
|
||||
|
||||
DATE = 0
|
||||
REPLY_DAY = 1
|
||||
REPLY_ACC = 2
|
||||
LIKE_DAY = 3
|
||||
LIKE_ACC = 4
|
||||
DAY = 5
|
||||
ACC = 6
|
||||
REPLY = 7
|
||||
LIKE = 8
|
||||
|
||||
BUZZ_KEY = [
|
||||
"date", "reply_day", "reply_acc", "like_day", "like_acc", "day", "acc"
|
||||
]
|
||||
|
||||
|
||||
def requests_get(req, timeout=requests_timeout):
|
||||
body = []
|
||||
@@ -136,10 +150,25 @@ class InstaContent:
|
||||
|
||||
|
||||
class EffectInsta(object):
|
||||
def __init__(self, event_num, event_code, url):
|
||||
|
||||
def __init__(self, event_num, event_code, url, start_date):
|
||||
self.event_num = event_num
|
||||
self.event_code = event_code
|
||||
self.url = url
|
||||
self.start_date = start_date.replace("-", "")
|
||||
self.database = self.database_init()
|
||||
|
||||
|
||||
def database_init(self):
|
||||
try:
|
||||
cg = get_settings()
|
||||
except Exception as e:
|
||||
raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')
|
||||
|
||||
database = ResultSender(cg['host'], cg['user'], cg['pass'], cg['name']) if cg else ResultSender()
|
||||
database.connect()
|
||||
|
||||
return database
|
||||
|
||||
def start(self):
|
||||
#content = insta.instacrawl.InstaContent(self.url, {}, self.url)
|
||||
@@ -187,8 +216,12 @@ class EffectInsta(object):
|
||||
result['replycount'] = int(body.get('article_order'), 0)
|
||||
result['likecount'] = int(body.get('reply_url'), 0)
|
||||
result['interactioncount'] = self.get_replycount(body, replies)
|
||||
result['replybuzz'] = self.get_reply_buzz(body, replies)
|
||||
replybuzz = self.get_reply_buzz(body, replies)
|
||||
likebuzzs = self.get_like_buzz(int(body.get('reply_url'), 0))
|
||||
totalbuzz = self.summary_reply_and_like(replybuzz, likebuzzs)
|
||||
result['replybuzz'] = json.dumps(totalbuzz, sort_keys=True)
|
||||
result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
|
||||
|
||||
return result
|
||||
|
||||
def get_replycount(self, body, replies):
|
||||
@@ -197,22 +230,171 @@ class EffectInsta(object):
|
||||
set_reply_id.add(i.get('article_id', ''))
|
||||
return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)
|
||||
|
||||
# def get_reply_buzz(self, body, replies):
|
||||
# start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
|
||||
# end_date = datetime.datetime.now().date()
|
||||
# date_dict = dict()
|
||||
# while start_date <= end_date:
|
||||
# date_dict[start_date.strftime('%Y%m%d')] = 0
|
||||
# start_date = start_date + datetime.timedelta(days=1)
|
||||
#
|
||||
# for reply in replies:
|
||||
# str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
|
||||
# reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
|
||||
# print(reply_date)
|
||||
# if reply_date in date_dict:
|
||||
# date_dict[reply_date] = date_dict[reply_date] + 1
|
||||
#
|
||||
# print(date_dict)
|
||||
#
|
||||
# json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
|
||||
#
|
||||
# return json.dumps(json_array, sort_keys=True)
|
||||
|
||||
def get_reply_buzz(self, body, replies):
|
||||
start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
|
||||
end_date = datetime.datetime.now().date()
|
||||
start_date = datetime.datetime.strptime(self.start_date, '%Y%m%d').date()
|
||||
today = datetime.datetime.now().date()
|
||||
|
||||
date_dict = dict()
|
||||
while start_date <= end_date:
|
||||
while start_date <= today:
|
||||
date_dict[start_date.strftime('%Y%m%d')] = 0
|
||||
start_date = start_date + datetime.timedelta(days=1)
|
||||
|
||||
for reply in replies:
|
||||
str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
|
||||
reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%m-%d-%Y')
|
||||
str_reply_date = reply.get('article_date')
|
||||
reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
|
||||
if reply_date in date_dict:
|
||||
date_dict[reply_date] = date_dict[reply_date] + 1
|
||||
|
||||
json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
|
||||
reply_buzzs = self.make_dummy_buzzs(self.start_date, datetime.datetime.today().strftime('%Y%m%d'))
|
||||
reply_acc_count = 0
|
||||
for reply_buzz in reply_buzzs:
|
||||
date = reply_buzz[BUZZ_KEY[DATE]]
|
||||
reply_count = date_dict[date]
|
||||
reply_acc_count += reply_count
|
||||
reply_buzz[BUZZ_KEY[DAY]] = date_dict[date]
|
||||
reply_buzz[BUZZ_KEY[ACC]] = reply_acc_count
|
||||
|
||||
return json.dumps(json_array, sort_keys=True)
|
||||
# json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
|
||||
# return json.dumps(json_array, sort_keys=True)
|
||||
return reply_buzzs
|
||||
|
||||
def get_like_buzz(self, like_count):
|
||||
start_date = datetime.datetime.strptime(self.start_date, '%Y%m%d').date()
|
||||
today = datetime.datetime.today().strftime('%Y%m%d')
|
||||
|
||||
try:
|
||||
buzzs = self.database.get_buzz(self.event_num)
|
||||
if buzzs != None:
|
||||
buzzs = json.loads(buzzs)
|
||||
else:
|
||||
buzzs = []
|
||||
buzzs = self.get_buzzs(buzzs, LIKE)
|
||||
like_dummy_buzzs = self.make_dummy_buzzs(self.start_date, today)
|
||||
like_buzzs = self.fill_buzzs_into_dummy(buzzs, like_dummy_buzzs)
|
||||
like_buzzs = self.put_today_buzz(like_buzzs, like_count)
|
||||
except Exception as e:
|
||||
raise effect.effecterror.DBQueryError(str(e))
|
||||
|
||||
return like_buzzs
|
||||
|
||||
def make_base_buzz_instance(self, values):
|
||||
base_buzz_instance = dict()
|
||||
base_buzz_instance[BUZZ_KEY[DATE]] = values[0]
|
||||
base_buzz_instance[BUZZ_KEY[DAY]] = values[1]
|
||||
base_buzz_instance[BUZZ_KEY[ACC]] = values[2]
|
||||
|
||||
return base_buzz_instance
|
||||
|
||||
def make_summary_buzz_instance(self, values):
|
||||
summary_buzz_instance = dict()
|
||||
summary_buzz_instance[BUZZ_KEY[DATE]] = values[0]
|
||||
summary_buzz_instance[BUZZ_KEY[REPLY_DAY]] = values[1][BUZZ_KEY[DAY]]
|
||||
summary_buzz_instance[BUZZ_KEY[REPLY_ACC]] = values[1][BUZZ_KEY[ACC]]
|
||||
summary_buzz_instance[BUZZ_KEY[LIKE_DAY]] = values[2][BUZZ_KEY[DAY]]
|
||||
summary_buzz_instance[BUZZ_KEY[LIKE_ACC]] = values[2][BUZZ_KEY[ACC]]
|
||||
|
||||
return summary_buzz_instance
|
||||
|
||||
def make_dummy_buzzs(self, start_date, end_date):
|
||||
|
||||
startdate = datetime.datetime.strptime(start_date, '%Y%m%d')
|
||||
enddate = datetime.datetime.strptime(end_date, '%Y%m%d')
|
||||
|
||||
buzzs = []
|
||||
while startdate <= enddate:
|
||||
buzz_instance = self.make_base_buzz_instance([startdate.strftime('%Y%m%d'), 0, 0])
|
||||
buzzs.append(buzz_instance)
|
||||
|
||||
startdate += datetime.timedelta(days=1)
|
||||
|
||||
return buzzs
|
||||
|
||||
def put_today_buzz(self, buzzs, today_acc_buzz_count):
|
||||
today = datetime.date.today().strftime('%Y%m%d')
|
||||
today_buzz_count = today_acc_buzz_count - buzzs[-2][BUZZ_KEY[ACC]]
|
||||
# if today_buzz_count < 0:
|
||||
# today_buzz_count = 0
|
||||
|
||||
result_buzzs = buzzs.copy()
|
||||
result_buzzs[-1][BUZZ_KEY[DAY]] = today_buzz_count if today_buzz_count >=0 else 0
|
||||
result_buzzs[-1][BUZZ_KEY[ACC]] = result_buzzs[-2][BUZZ_KEY[ACC]] + today_buzz_count
|
||||
|
||||
return result_buzzs
|
||||
|
||||
def fill_buzzs_into_dummy(self, buzzs, dummy):
|
||||
buzzs_clone = buzzs.copy()
|
||||
dummy_clone = dummy.copy()
|
||||
|
||||
for dummy_buzz, real_buzz in zip(dummy_clone, buzzs_clone):
|
||||
dummy_buzz[BUZZ_KEY[DATE]] = real_buzz[BUZZ_KEY[DATE]]
|
||||
dummy_buzz[BUZZ_KEY[DAY]] = real_buzz[BUZZ_KEY[DAY]]
|
||||
dummy_buzz[BUZZ_KEY[ACC]] = real_buzz[BUZZ_KEY[ACC]]
|
||||
|
||||
for index, dummy_buzz in enumerate(dummy_clone):
|
||||
previous_index = index - 1
|
||||
previous_acc_value = dummy_clone[previous_index][BUZZ_KEY[ACC]]
|
||||
current_acc_value = dummy_buzz[BUZZ_KEY[ACC]]
|
||||
|
||||
if previous_acc_value > 0 and current_acc_value == 0 and previous_index >= 0:
|
||||
dummy_buzz[BUZZ_KEY[ACC]] = previous_acc_value
|
||||
|
||||
return dummy_clone
|
||||
|
||||
def get_buzzs(self, buzzs, buzz_type):
|
||||
result_buzzs = []
|
||||
if buzz_type == LIKE:
|
||||
for buzz in buzzs:
|
||||
buzz_instance = self.make_base_buzz_instance([
|
||||
buzz[BUZZ_KEY[DATE]],
|
||||
buzz[BUZZ_KEY[LIKE_DAY]],
|
||||
buzz[BUZZ_KEY[LIKE_ACC]]
|
||||
])
|
||||
result_buzzs.append(buzz_instance)
|
||||
|
||||
return result_buzzs
|
||||
|
||||
def is_valid_data(self, reply_buzzs, like_buzzs):
|
||||
reply_dates = self.get_date_list(reply_buzzs)
|
||||
like_dates = self.get_date_list(like_buzzs)
|
||||
|
||||
if reply_dates == like_dates:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
def summary_reply_and_like(self, reply_buzzs, like_buzzs):
|
||||
# if self.is_valid_data(reply_buzzs, like_buzzs) == False:
|
||||
# raise IndexError("")
|
||||
|
||||
summary_buzzs = []
|
||||
for reply_buzz, like_buzz in zip(reply_buzzs, like_buzzs):
|
||||
date = reply_buzz[BUZZ_KEY[DATE]]
|
||||
summary_buzz_instance = self.make_summary_buzz_instance([
|
||||
date,
|
||||
reply_buzz,
|
||||
like_buzz
|
||||
])
|
||||
summary_buzzs.append(summary_buzz_instance)
|
||||
|
||||
return summary_buzzs
|
||||
@@ -37,6 +37,17 @@ class ResultSender:
|
||||
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" + " on duplicate key update " + \
|
||||
','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))
|
||||
|
||||
def get_buzz(self, event_num):
|
||||
query = 'select replybuzz from stats_s1_effect where event_num = ' + str(event_num)
|
||||
|
||||
if not self.conn.open:
|
||||
self.connect()
|
||||
with self.conn.cursor() as cursor:
|
||||
cursor.execute(query)
|
||||
buzz = cursor.fetchone()
|
||||
|
||||
return buzz['replybuzz'] if buzz != None else buzz
|
||||
|
||||
def send(self, table_name, dictionary):
|
||||
query = self._make_query(table_name, dictionary)
|
||||
self._exec_query(query)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import effect.effectinstagram
|
||||
import effect.effecterror
|
||||
import effect.effectkakaostory
|
||||
from effect.InstaUrlValidator import InstaUrlValidator
|
||||
from base.baseclasses import printl
|
||||
import sys
|
||||
import base.baseclasses
|
||||
@@ -33,9 +34,17 @@ def get_browser_info(platform_, file_name="browser.txt"):
|
||||
return options.get(platform_, options['default'])
|
||||
|
||||
|
||||
def get_effect_process(platform_, event_num, url):
|
||||
def get_effect_process(platform_, event_num, url, start_date):
|
||||
if platform_ == 'instagram':
|
||||
return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url)
|
||||
try:
|
||||
insta_url_validator = InstaUrlValidator(url)
|
||||
insta_url = insta_url_validator.get_insta_url()
|
||||
except Exception as e:
|
||||
printl("x!@#!@#!@#e010!@#check url")
|
||||
exit(1)
|
||||
|
||||
# return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url, start_date)
|
||||
return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), insta_url, start_date)
|
||||
|
||||
else:
|
||||
browser_info = get_browser_info(platform_)
|
||||
@@ -52,14 +61,19 @@ if __name__ == '__main__':
|
||||
sys.argv[1] instagram, kakaostory, facebook
|
||||
sys.argv[2] event_num
|
||||
sys.argv[3] url
|
||||
sys.argv[4] start date
|
||||
"""
|
||||
|
||||
if len(sys.argv) != 4:
|
||||
# if len(sys.argv) != 4:
|
||||
# printl("x!@#!@#!@#e010!@#check argument")
|
||||
# exit(1)
|
||||
|
||||
if len(sys.argv) != 5:
|
||||
printl("x!@#!@#!@#e010!@#check argument")
|
||||
exit(1)
|
||||
|
||||
try:
|
||||
effect_process = get_effect_process(sys.argv[1], sys.argv[2], sys.argv[3])
|
||||
effect_process = get_effect_process(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
|
||||
effect_process.start()
|
||||
except effect.effecterror.EffectException as e:
|
||||
printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + str(e))
|
||||
|
||||
91
WebBasedCrawler/facebook/facebookcrawl_new.py
Normal file
91
WebBasedCrawler/facebook/facebookcrawl_new.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import time
|
||||
from selenium.common.exceptions import WebDriverException
|
||||
from base.baseclasses import find_element_by_css_selector
|
||||
from base.baseclasses import find_elements_by_css_selector
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
try:
|
||||
import lxml
|
||||
parser_opt = 'lxml'
|
||||
except ImportError:
|
||||
parser_opt = 'html.parser'
|
||||
|
||||
limit_reload = 5
|
||||
|
||||
list_tag_css_selector = "div#initial_browse_result"
|
||||
list_page_css_selector = "div#pagelet_timeline_main_column"
|
||||
list_group_css_selector = "div#pagelet_group_"
|
||||
each_post_css_selector = "div._4-u2._4-u8"
|
||||
wait_second_for_find_element = 30
|
||||
|
||||
|
||||
class ListBase(object):
    """Base scraper over a list of Facebook post elements on the current page.

    Subclasses set ``list_css_selector`` to the container holding the posts
    and implement the abstract accessors (``has_next``/``get_url``/
    ``get_date``/``get_num_of_list``).
    """

    def __init__(self, driver):
        self.driver = driver            # selenium WebDriver
        self.url_list = []              # post elements still to process
        self.list_css_selector = None   # set by subclass
        self.list_container_dom = None
        self.current_post = None        # element currently being processed

    def set_url_elements(self):
        """Collect the post elements under this scraper's container."""
        elements = find_element_by_css_selector(self.driver,
                                                self.list_css_selector + " " + each_post_css_selector,
                                                wait_second_for_find_element)
        self.url_list.extend(elements)

    def move_first(self):
        """Pop the first pending post into ``current_post`` (None when empty).

        BUG FIX: the original had the assignment inverted
        (``self.url_list = self.current_post.pop(0) ...``) — ``current_post``
        starts as None, so that always raised AttributeError and clobbered
        the list.
        """
        self.current_post = self.url_list.pop(0) if self.url_list else None

    def move_next(self):
        """Advance to the next post (same operation as move_first)."""
        self.move_first()

    def check_list_and_load(self):
        """Ensure at least two pending posts, scrolling to load more if needed.

        Retries up to ``limit_reload`` times; raises when nothing loads.
        """
        num_of_list = len(self.url_list)
        for _ in range(limit_reload):
            num_of_list = len(self.url_list)
            if num_of_list < 2:
                self.load_more_list()
                num_of_list = self.get_num_of_list()
        # NOTE(review): reconstructed placement — raise after exhausting the
        # retries when no posts could be loaded.
        if not num_of_list:
            raise WebDriverException("There is no data or ajax error")

    def load_more_list(self):
        """Trigger Facebook's infinite scroll by jiggling the viewport."""
        position = self.driver.get_window_position()
        size = self.driver.get_window_size()
        # Maximize briefly, then restore the original geometry so the page
        # re-evaluates its lazy-load triggers.
        self.driver.maximize_window()
        self.driver.set_window_size(size['width'], size["height"])
        self.driver.set_window_position(position['x'], position['y'])
        for _ in range(2):
            self.driver.execute_script("window.scrollBy(0, -400)")
            time.sleep(0.3)
        for _ in range(4):
            self.driver.execute_script("window.scrollBy(0, 800)")
            time.sleep(0.3)

    def has_next(self):
        raise NotImplementedError

    def get_url(self):
        raise NotImplementedError

    def get_date(self):
        raise NotImplementedError

    def remove_current_post(self):
        """Delete the current post's DOM node so it is not scraped twice."""
        css_selector = "div#" + self.current_post.id
        self.driver.execute_script('document.querySelector("' + css_selector + '").remove()')

    def get_num_of_list(self):
        raise NotImplementedError
|
||||
|
||||
|
||||
class ListTag(ListBase):
    """Scraper for the post list shown on a hashtag search results page."""

    def __init__(self, driver):
        # Set up the shared list state, then aim this scraper at the
        # tag-search results container.
        ListBase.__init__(self, driver)
        self.list_css_selector = list_tag_css_selector
|
||||
|
||||
|
||||
class ListPage(ListBase):
    """Scraper for the post list on a profile/page timeline."""

    def __init__(self, driver):
        # BUG FIX: the original assigned self.driver directly and skipped
        # ListBase.__init__, leaving url_list / current_post /
        # list_container_dom undefined (and inconsistent with ListTag).
        super().__init__(driver)
        self.list_css_selector = list_page_css_selector
|
||||
|
||||
197
WebBasedCrawler/facebook/facebookcrawltemp.py
Normal file
197
WebBasedCrawler/facebook/facebookcrawltemp.py
Normal file
@@ -0,0 +1,197 @@
|
||||
#-*- coding: utf-8 -*-
|
||||
|
||||
import logging
|
||||
import re
|
||||
import json
|
||||
import datetime
|
||||
import time
|
||||
|
||||
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.common.exceptions import WebDriverException
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from base.baseclasses import SendtoDB
|
||||
from base.baseclasses import print_and_flush
|
||||
from base.baseclasses import CrawlInit
|
||||
from base.baseclasses import wait
|
||||
from base.baseclasses import find_element_by_css_selector
|
||||
from base.baseclasses import find_elements_by_css_selector
|
||||
from base.baseclasses import find_elements_by_xpath
|
||||
from base.baseclasses import enter_element
|
||||
from base.baseclasses import Browser
|
||||
|
||||
facebook_url = "http://bigbird.iptime.org/fbtest.php"
|
||||
facebook_tag_url = "https://www.facebook.com/hashtag/"
|
||||
|
||||
facebook_id = 'concepters22@gmail.com'
|
||||
facebook_password = 'zjstpqxjtm'
|
||||
|
||||
|
||||
class FacebookInit(CrawlInit):
    """Crawl configuration for Facebook.

    Platform code 11 crawls hashtag searches; 12 crawls pages/profiles.
    (Platform semantics inferred from the url table and split_searches —
    TODO confirm against CrawlInit.)
    """

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # Base URL per platform code.
        self.urls = dict()
        self.urls[11] = facebook_tag_url
        self.urls[12] = facebook_url

    def split_searches(self):
        """Split the comma-separated search string into cleaned terms.

        Page searches (platform 12) are plain strings and only need
        stripping; other searches go through the base class utf8 helper.
        (Removed the equivalent commented-out loop version.)
        """
        splited_list = self.searches().split(',')
        return [x.strip() if self.platform() == 12 else self.utf8(x) for x in splited_list]

    def make_url(self):
        """Build one crawl URL per search term.

        Numeric terms are treated as profile ids (``profile.php?id=``);
        ``?fref=ts`` is appended as in the original.
        """
        return [self.urls[self.platform()] + ('profile.php?id=' + x if x.isnumeric() else x) + "?fref=ts"
                for x in self.split_searches()]

    def get_begin_day(self):
        """First day to crawl.

        Realtime mode: midnight today shifted by ``before_day``;
        otherwise the configured start day.
        """
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            result += datetime.timedelta(days=self.before_day)
            return result
        else:
            return self.start_day()

    def get_end_day(self):
        """Last day to crawl: midnight today in realtime mode, else the
        configured end day."""
        if self.is_realtime():
            date_now = datetime.datetime.now()
            return datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
        else:
            return self.end_day()

    def is_hashtag(self):
        """True for hashtag crawls (every platform except 12).

        Simplified from ``return False if ... == 12 else True``.
        """
        return self.platform() != 12
|
||||
|
||||
|
||||
class FacebookMainCrawler:
    """Top-level driver for the Facebook crawl.

    Owns the WebDriver lifecycle, logs in, visits each search URL produced
    by FacebookInit, and hands the page off to the per-page crawler.

    NOTE(review): crawl_start references ``self.page_crawler`` and
    ``self.crawl_all_current_url`` which are not defined in this class as
    visible here — presumably provided by a subclass or later patch; verify.
    """

    def __init__(self):
        self.crawl_init = FacebookInit()
        self.browser = Browser()
        self.driver = None       # selenium WebDriver; set via init_browser()
        self.keyword_id = None
        self.url = None
        self.db_num = None

    def set_driver(self, driver):
        self.driver = driver

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def start(self):
        """Entry point: run the crawl loop."""
        self.crawl_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Configure everything needed before start(), in dependency order."""
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        # Spawn a fresh WebDriver of the requested kind.
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        # Coerce to int, load the keyword's crawl parameters, then release
        # the DB connection held by crawl_init.
        self.keyword_id = int(keyword_id) if type(keyword_id) != int else keyword_id
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.db_num = db_num

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def set_main_window_handler(self, window_handler):
        self.main_window_handler = window_handler

    def crawl_start(self):
        """Main loop: crawl every search URL; repeat forever in realtime mode.

        On any per-URL failure the driver is recycled and the same URL is
        retried (``i`` is only advanced on success).
        """
        real_time = True
        while real_time:
            print_and_flush("Crawler Start")
            url_list = self.crawl_init.make_url()
            i = 0
            backup_set = set()   # de-dup store for the current URL's posts
            while i < len(url_list):
                try:
                    self.set_main_window_handler(self.driver.window_handles[0])
                    print_and_flush(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    self.facebook_login()
                    # Focus the page body so keyboard/scroll events land.
                    body = self.driver.find_element_by_tag_name('body')
                    self.click_element(body)
                    self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
                                               end_date=self.crawl_init.get_end_day())
                    self.crawl_all_current_url(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    # Recycle the browser on any failure and retry this URL.
                    logging.info(e)
                    self.driver.quit()
                    self.set_driver(self.browser.new_browser())
                    wait(5)
            # Loop again only while the keyword is configured as realtime.
            real_time = self.crawl_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        self.driver.quit()

    def go_bigbird(self, driver):
        """Navigate to the internal landing page."""
        driver.get(facebook_url)

    def click_facebook_login(self, driver):
        """Press the first link on the landing page (the login link)."""
        element_a = find_element_by_css_selector(driver, "a[href]", 15)
        enter_element(element_a)

    def login_facebook(self, driver, f_id, f_pw):
        """Fill and submit Facebook's login form with the given credentials."""
        element_email = find_element_by_css_selector(driver, "input#email", 15)
        element_password = find_element_by_css_selector(driver, "input#pass", 15)
        element_button = find_element_by_css_selector(driver, "button#loginbutton", 15)
        element_email.send_keys(f_id)
        element_password.send_keys(f_pw)
        enter_element(element_button)

    def facebook_login(self):
        """Log in on the current page if a login form is present; no-op otherwise.

        NOTE(review): credentials are hard-coded here (and duplicated in the
        module constants) — move them to configuration/secrets storage.
        The bare ``except`` deliberately treats "no login form" as success,
        but it also swallows every other error; consider narrowing it.
        """
        try:
            element_email = find_element_by_css_selector(self.driver, '#email', 15)
            element_pwd = find_element_by_css_selector(self.driver, '#pass', 15)
        except:
            return
        email = 'concepters22@gmail.com'
        password = 'zjstpqxjtm'
        element_email.send_keys(email)
        element_pwd.send_keys(password)
        # The login button is a <label> wrapping the real <input>.
        label = self.driver.find_element_by_css_selector('#loginbutton')
        element_input = label.find_element_by_xpath('input')
        element_input.send_keys(Keys.NULL)
        element_input.send_keys(Keys.ENTER)
        wait(5)

    def click_element(self, element):
        """Move the mouse to *element* and click it, then pause briefly."""
        ac = ActionChains(self.driver)
        # ac.move_to_element_with_offset(element, 0, 0).click().perform()
        ac.move_to_element(element).click().perform()
        wait(4)
|
||||
|
||||
@@ -314,7 +314,6 @@ def crawl_content_process(qu, keyword_id, db_num):
|
||||
break
|
||||
ok = True
|
||||
while ok:
|
||||
time.sleep(2)
|
||||
try:
|
||||
# get a instance of InstaContent by do_no_proxy func.
|
||||
# if element['url'] is invalid, content is None
|
||||
|
||||
@@ -103,10 +103,10 @@ def parse_body_html(content):
|
||||
start_cursor = None
|
||||
has_previous = False
|
||||
if postpage:
|
||||
media = postpage[0]["media"]
|
||||
media = postpage[0]["graphql"]["shortcode_media"]
|
||||
body = {
|
||||
"article_date": (old_date + datetime.timedelta(seconds=media["date"])).strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"article_data": media["caption"],
|
||||
"article_date": (old_date + datetime.timedelta(seconds=media["taken_at_timestamp"])).strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"article_data": media["edge_media_to_caption"]["edges"][0]["node"]["text"],
|
||||
"article_id": media["owner"]["username"],
|
||||
"article_nickname": media["owner"]["username"],
|
||||
"platform_id": media["owner"]["username"],
|
||||
@@ -115,22 +115,22 @@ def parse_body_html(content):
|
||||
"platform_title": media["owner"]["username"],
|
||||
"article_form": "body",
|
||||
"article_profileurl": media["owner"]["profile_pic_url"],
|
||||
"article_order": str(media["comments"]["count"]),
|
||||
"article_hit": str(media.get('video_views', 0)),
|
||||
"reply_url": str(media["likes"]["count"])
|
||||
"article_order": str(media["edge_media_to_comment"]["count"]),
|
||||
"article_hit": str(0),
|
||||
"reply_url": str(media["edge_media_preview_like"]["count"])
|
||||
}
|
||||
comments = postpage[0]["media"]["comments"]
|
||||
has_previous = comments["page_info"]["has_previous_page"]
|
||||
start_cursor = comments["page_info"]["start_cursor"]
|
||||
nodes = comments["nodes"]
|
||||
comments = postpage[0]["graphql"]["shortcode_media"]["edge_media_to_comment"]
|
||||
has_previous = comments["page_info"]["has_next_page"]
|
||||
start_cursor = comments["page_info"]["end_cursor"]
|
||||
nodes = comments["edges"]
|
||||
for node in nodes:
|
||||
reply.append({
|
||||
"article_data": node["text"],
|
||||
"article_data": node["node"]["text"],
|
||||
"article_date":
|
||||
(old_date + datetime.timedelta(seconds=node["created_at"])).strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"article_id": node["user"]["username"],
|
||||
"article_nickname": node["user"]["username"],
|
||||
"article_profileurl": node["user"]["profile_pic_url"],
|
||||
(old_date + datetime.timedelta(seconds=node["node"]["created_at"])).strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"article_id": node["node"]["owner"]["username"],
|
||||
"article_nickname": node["node"]["owner"]["username"],
|
||||
"article_profileurl": node["node"]["owner"]["profile_pic_url"],
|
||||
"platform_name": "instagram",
|
||||
"platform_form": "post",
|
||||
"article_form": "reply"
|
||||
|
||||
Reference in New Issue
Block a user