git-svn-id: svn://192.168.0.12/source@348 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -1,5 +1,11 @@
|
|||||||
[database]
|
[#database]
|
||||||
user=root
|
user=root
|
||||||
pass=1234
|
pass=1234
|
||||||
host=192.168.0.82
|
host=192.168.0.82
|
||||||
|
name=bigbird
|
||||||
|
|
||||||
|
[database]
|
||||||
|
user=admin
|
||||||
|
pass=con2214lac!
|
||||||
|
host=182.162.171.147
|
||||||
name=bigbird
|
name=bigbird
|
||||||
86
WebBasedCrawler/effect/InstaUrlValidator.py
Normal file
86
WebBasedCrawler/effect/InstaUrlValidator.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
class InstaUrlValidator:
    """Validate an Instagram post URL and rebuild it in canonical form.

    The accepted shape is ``[scheme://]host/p/<post key>[/...]``. The
    canonical form produced by :meth:`get_insta_url` is
    ``https://www.instagram.com/p/<post key>/``.
    """

    def __init__(self, input_url):
        self.protocol = 'https'
        self.host = 'www.instagram.com'
        self.path1 = 'p'

        self.input_user_key = ''
        self.input_url = input_url
        # Filled in by preprocess_input_url(); initialised here so that the
        # check_* methods never hit a missing attribute.
        self.preprocessed_input_url = ''

    def _accepted_hosts(self):
        """Return the host names treated as Instagram: with and without 'www.'."""
        full_host = self.host.lower()
        bare_host = full_host[4:] if full_host.startswith('www.') else full_host
        return {full_host, bare_host}

    def preprocess_input_url(self):
        """Store the stripped input URL.

        Raises:
            TypeError: if the input is not a string.
        """
        if not isinstance(self.input_url, str):
            raise TypeError('input url error')

        self.preprocessed_input_url = self.input_url.strip()

    def check_protocol(self):
        """Return the index where the host starts.

        A URL without any ':' is treated as scheme-less and starts at 0.

        Raises:
            ValueError: if the first ':' is not followed by '//'.
        """
        url = self.preprocessed_input_url
        end_index = url.find(':')
        if end_index == -1:
            return 0

        # Slicing (instead of indexing) cannot run past the end of the string,
        # so an input such as 'https:' is reported as a format error.
        if url[end_index + 1:end_index + 3] != '//':
            raise ValueError('incorrect url format')

        return end_index + 3

    def check_host(self, start_index):
        """Check the host part and return the index just after its trailing '/'.

        Raises:
            ValueError: if there is no '/' after the host or the host is not
                an accepted Instagram host name.
        """
        end_index = self.preprocessed_input_url.find('/', start_index)
        if end_index == -1:
            raise ValueError('incorrect url format')

        # Exact, case-insensitive match. A substring test would also accept
        # fragments such as 'gram' or an empty host.
        input_host = self.preprocessed_input_url[start_index:end_index].lower()
        if input_host not in self._accepted_hosts():
            raise ValueError('incorrect host')

        return end_index + 1

    def check_path1(self, start_index):
        """Check that the first path segment is 'p'; return the next index.

        Raises:
            ValueError: if the segment is missing or is not 'p'.
        """
        end_index = self.preprocessed_input_url.find('/', start_index)
        if end_index == -1:
            raise ValueError('incorrect path')

        input_path1 = self.preprocessed_input_url[start_index:end_index]
        if input_path1 != self.path1:
            raise ValueError('incorrect path (/p/)')

        return end_index + 1

    def check_path2(self, start_index):
        """Extract the post key (second path segment) into input_user_key."""
        key = self.preprocessed_input_url[start_index:]
        # The key ends at the next path separator, query string or fragment,
        # whichever comes first.
        for separator in ('/', '?', '#'):
            cut = key.find(separator)
            if cut != -1:
                key = key[:cut]

        self.input_user_key = key

    def make_instagram_url(self):
        """Build the canonical post URL from the extracted key.

        Raises:
            ValueError: if no key was extracted.
        """
        if not self.input_user_key:
            raise ValueError('incorrect user key')

        return self.protocol + '://' + self.host + '/' + self.path1 + '/' + self.input_user_key + '/'

    def validate_url(self):
        """Run every check in order; any failure propagates as an exception."""
        self.preprocess_input_url()
        start_index = self.check_protocol()
        start_index = self.check_host(start_index)
        start_index = self.check_path1(start_index)
        self.check_path2(start_index)

    def get_insta_url(self):
        """Validate the input URL and return its canonical form.

        Raises:
            TypeError: if the input is not a string.
            ValueError: if the input is not an Instagram post URL.
        """
        self.validate_url()
        return self.make_instagram_url()
|
||||||
@@ -36,6 +36,20 @@ insta_tag_url = "https://www.instagram.com/explore/tags/"
|
|||||||
insta_query = "https://www.instagram.com/query/"
|
insta_query = "https://www.instagram.com/query/"
|
||||||
insta_body_url = 'https://www.instagram.com/p/'
|
insta_body_url = 'https://www.instagram.com/p/'
|
||||||
|
|
||||||
|
# Positions of the per-day field names inside BUZZ_KEY.
DATE, REPLY_DAY, REPLY_ACC, LIKE_DAY, LIKE_ACC, DAY, ACC = range(7)

# REPLY and LIKE are not BUZZ_KEY positions (the list has only seven entries).
# LIKE is compared against the buzz_type argument of get_buzzs().
REPLY = 7
LIKE = 8

BUZZ_KEY = [
    "date",
    "reply_day",
    "reply_acc",
    "like_day",
    "like_acc",
    "day",
    "acc",
]
|
||||||
|
|
||||||
|
|
||||||
def requests_get(req, timeout=requests_timeout):
|
def requests_get(req, timeout=requests_timeout):
|
||||||
body = []
|
body = []
|
||||||
@@ -136,10 +150,25 @@ class InstaContent:
|
|||||||
|
|
||||||
|
|
||||||
class EffectInsta(object):
|
class EffectInsta(object):
|
||||||
def __init__(self, event_num, event_code, url):
|
|
||||||
|
def __init__(self, event_num, event_code, url, start_date):
|
||||||
self.event_num = event_num
|
self.event_num = event_num
|
||||||
self.event_code = event_code
|
self.event_code = event_code
|
||||||
self.url = url
|
self.url = url
|
||||||
|
self.start_date = start_date.replace("-", "")
|
||||||
|
self.database = self.database_init()
|
||||||
|
|
||||||
|
|
||||||
|
def database_init(self):
    """Create a ResultSender from the loaded settings and connect it.

    Returns:
        The connected ResultSender. It is built with the host/user/pass/name
        values from the settings when they are non-empty, otherwise with
        ResultSender's own defaults.

    Raises:
        effect.effecterror.UnknownError: if get_settings() fails.
    """
    try:
        settings = get_settings()
    except Exception as e:
        raise effect.effecterror.UnknownError(str(e) + '\n' + 'effect.ini setting error')

    if settings:
        sender = ResultSender(settings['host'], settings['user'], settings['pass'], settings['name'])
    else:
        sender = ResultSender()

    sender.connect()
    return sender
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
#content = insta.instacrawl.InstaContent(self.url, {}, self.url)
|
#content = insta.instacrawl.InstaContent(self.url, {}, self.url)
|
||||||
@@ -187,8 +216,12 @@ class EffectInsta(object):
|
|||||||
result['replycount'] = int(body.get('article_order'), 0)
|
result['replycount'] = int(body.get('article_order'), 0)
|
||||||
result['likecount'] = int(body.get('reply_url'), 0)
|
result['likecount'] = int(body.get('reply_url'), 0)
|
||||||
result['interactioncount'] = self.get_replycount(body, replies)
|
result['interactioncount'] = self.get_replycount(body, replies)
|
||||||
result['replybuzz'] = self.get_reply_buzz(body, replies)
|
replybuzz = self.get_reply_buzz(body, replies)
|
||||||
|
likebuzzs = self.get_like_buzz(int(body.get('reply_url'), 0))
|
||||||
|
totalbuzz = self.summary_reply_and_like(replybuzz, likebuzzs)
|
||||||
|
result['replybuzz'] = json.dumps(totalbuzz, sort_keys=True)
|
||||||
result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
|
result['engagementcount'] = result.get('likecount', 0) + result.get('replycount', 0)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def get_replycount(self, body, replies):
|
def get_replycount(self, body, replies):
|
||||||
@@ -197,22 +230,171 @@ class EffectInsta(object):
|
|||||||
set_reply_id.add(i.get('article_id', ''))
|
set_reply_id.add(i.get('article_id', ''))
|
||||||
return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)
|
return len(set_reply_id) - 1 if body.get('article_id') in set_reply_id else len(set_reply_id)
|
||||||
|
|
||||||
|
# def get_reply_buzz(self, body, replies):
|
||||||
|
# start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
|
||||||
|
# end_date = datetime.datetime.now().date()
|
||||||
|
# date_dict = dict()
|
||||||
|
# while start_date <= end_date:
|
||||||
|
# date_dict[start_date.strftime('%Y%m%d')] = 0
|
||||||
|
# start_date = start_date + datetime.timedelta(days=1)
|
||||||
|
#
|
||||||
|
# for reply in replies:
|
||||||
|
# str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
|
||||||
|
# reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
|
||||||
|
# print(reply_date)
|
||||||
|
# if reply_date in date_dict:
|
||||||
|
# date_dict[reply_date] = date_dict[reply_date] + 1
|
||||||
|
#
|
||||||
|
# print(date_dict)
|
||||||
|
#
|
||||||
|
# json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
|
||||||
|
#
|
||||||
|
# return json.dumps(json_array, sort_keys=True)
|
||||||
|
|
||||||
def get_reply_buzz(self, body, replies):
|
def get_reply_buzz(self, body, replies):
|
||||||
start_date = datetime.datetime.strptime(body['article_date'], '%Y-%m-%d %H:%M:%S').date()
|
start_date = datetime.datetime.strptime(self.start_date, '%Y%m%d').date()
|
||||||
end_date = datetime.datetime.now().date()
|
today = datetime.datetime.now().date()
|
||||||
|
|
||||||
date_dict = dict()
|
date_dict = dict()
|
||||||
while start_date <= end_date:
|
while start_date <= today:
|
||||||
date_dict[start_date.strftime('%Y%m%d')] = 0
|
date_dict[start_date.strftime('%Y%m%d')] = 0
|
||||||
start_date = start_date + datetime.timedelta(days=1)
|
start_date = start_date + datetime.timedelta(days=1)
|
||||||
|
|
||||||
for reply in replies:
|
for reply in replies:
|
||||||
str_reply_date = reply.get('article_date', '1990-01-01 00:00:00')
|
str_reply_date = reply.get('article_date')
|
||||||
reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%m-%d-%Y')
|
reply_date = datetime.datetime.strptime(str_reply_date, '%Y-%m-%d %H:%M:%S').date().strftime('%Y%m%d')
|
||||||
if reply_date in date_dict:
|
if reply_date in date_dict:
|
||||||
date_dict[reply_date] = date_dict[reply_date] + 1
|
date_dict[reply_date] = date_dict[reply_date] + 1
|
||||||
|
|
||||||
json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
|
reply_buzzs = self.make_dummy_buzzs(self.start_date, datetime.datetime.today().strftime('%Y%m%d'))
|
||||||
|
reply_acc_count = 0
|
||||||
|
for reply_buzz in reply_buzzs:
|
||||||
|
date = reply_buzz[BUZZ_KEY[DATE]]
|
||||||
|
reply_count = date_dict[date]
|
||||||
|
reply_acc_count += reply_count
|
||||||
|
reply_buzz[BUZZ_KEY[DAY]] = date_dict[date]
|
||||||
|
reply_buzz[BUZZ_KEY[ACC]] = reply_acc_count
|
||||||
|
|
||||||
return json.dumps(json_array, sort_keys=True)
|
# json_array = [{'date': k, 'value': v} for k, v in date_dict.items()]
|
||||||
|
# return json.dumps(json_array, sort_keys=True)
|
||||||
|
return reply_buzzs
|
||||||
|
|
||||||
|
def get_like_buzz(self, like_count):
    """Build the per-day like series from start_date up to today.

    The previously stored series is read through self.database.get_buzz(),
    laid over a zero-filled series covering start_date..today, and then
    today's entry is updated from ``like_count``.

    Args:
        like_count: accumulated like count observed now.

    Returns:
        A list of per-day dicts keyed by BUZZ_KEY[DATE], BUZZ_KEY[DAY] and
        BUZZ_KEY[ACC].

    Raises:
        ValueError: if self.start_date is not in '%Y%m%d' format.
        effect.effecterror.DBQueryError: if reading or assembling the series
            fails; the original exception is attached as the cause.
    """
    # Parsed only to reject a malformed start date before touching the
    # database; the parsed value itself is not needed.
    datetime.datetime.strptime(self.start_date, '%Y%m%d')
    today = datetime.datetime.today().strftime('%Y%m%d')

    try:
        stored = self.database.get_buzz(self.event_num)
        stored_buzzs = json.loads(stored) if stored is not None else []
        previous_likes = self.get_buzzs(stored_buzzs, LIKE)
        like_dummy_buzzs = self.make_dummy_buzzs(self.start_date, today)
        like_buzzs = self.fill_buzzs_into_dummy(previous_likes, like_dummy_buzzs)
        like_buzzs = self.put_today_buzz(like_buzzs, like_count)
    except Exception as e:
        raise effect.effecterror.DBQueryError(str(e)) from e

    return like_buzzs
|
||||||
|
|
||||||
|
def make_base_buzz_instance(self, values):
    """Return one per-day entry built from ``values`` = [date, day, acc]."""
    return {
        BUZZ_KEY[DATE]: values[0],
        BUZZ_KEY[DAY]: values[1],
        BUZZ_KEY[ACC]: values[2],
    }
|
||||||
|
|
||||||
|
def make_summary_buzz_instance(self, values):
    """Merge one reply entry and one like entry into a combined per-day dict.

    Args:
        values: [date, reply_entry, like_entry], where both entries are
            dicts holding BUZZ_KEY[DAY] and BUZZ_KEY[ACC].
    """
    reply_entry = values[1]
    like_entry = values[2]
    day_key = BUZZ_KEY[DAY]
    acc_key = BUZZ_KEY[ACC]

    return {
        BUZZ_KEY[DATE]: values[0],
        BUZZ_KEY[REPLY_DAY]: reply_entry[day_key],
        BUZZ_KEY[REPLY_ACC]: reply_entry[acc_key],
        BUZZ_KEY[LIKE_DAY]: like_entry[day_key],
        BUZZ_KEY[LIKE_ACC]: like_entry[acc_key],
    }
|
||||||
|
|
||||||
|
def make_dummy_buzzs(self, start_date, end_date):
    """Return zero-filled per-day entries for start_date..end_date inclusive.

    Both dates are '%Y%m%d' strings. The result is empty when end_date is
    earlier than start_date.
    """
    first_day = datetime.datetime.strptime(start_date, '%Y%m%d')
    last_day = datetime.datetime.strptime(end_date, '%Y%m%d')
    day_count = (last_day - first_day).days + 1

    return [
        self.make_base_buzz_instance(
            [(first_day + datetime.timedelta(days=offset)).strftime('%Y%m%d'), 0, 0]
        )
        for offset in range(day_count)
    ]
|
||||||
|
|
||||||
|
def put_today_buzz(self, buzzs, today_acc_buzz_count):
    """Write today's counts into the last entry of a per-day series.

    Args:
        buzzs: per-day entries in date order; the last one is today.
        today_acc_buzz_count: accumulated count observed now.

    Returns:
        A new list holding the same entry dicts, with the last entry's
        BUZZ_KEY[DAY] and BUZZ_KEY[ACC] updated. An empty input yields an
        empty list.
    """
    if not buzzs:
        return []

    # Shallow copy: the list is new, the entry dicts are shared with `buzzs`.
    result_buzzs = buzzs.copy()

    # A series with a single entry (the event starts today) has no previous
    # day, so the previous accumulated count is 0 rather than buzzs[-2].
    previous_acc = result_buzzs[-2][BUZZ_KEY[ACC]] if len(result_buzzs) > 1 else 0
    today_buzz_count = today_acc_buzz_count - previous_acc

    # The daily count is clamped at 0. The accumulated count is left
    # unclamped, so it always equals today_acc_buzz_count.
    result_buzzs[-1][BUZZ_KEY[DAY]] = today_buzz_count if today_buzz_count >= 0 else 0
    result_buzzs[-1][BUZZ_KEY[ACC]] = previous_acc + today_buzz_count

    return result_buzzs
|
||||||
|
|
||||||
|
def fill_buzzs_into_dummy(self, buzzs, dummy):
    """Overlay stored entries onto a zero-filled series, then carry totals forward.

    Stored entries are copied onto the dummy entries by position (not by
    date). Afterwards an entry whose accumulated count is 0 inherits the
    accumulated count of the entry before it when that count is positive.

    The returned list is new, but its entry dicts are the ones passed in
    ``dummy`` and are modified in place.
    """
    date_key = BUZZ_KEY[DATE]
    day_key = BUZZ_KEY[DAY]
    acc_key = BUZZ_KEY[ACC]

    filled = list(dummy)
    for slot, stored in zip(filled, list(buzzs)):
        slot[date_key] = stored[date_key]
        slot[day_key] = stored[day_key]
        slot[acc_key] = stored[acc_key]

    previous_acc = None  # None marks "no previous entry" for the first slot
    for slot in filled:
        if previous_acc is not None and previous_acc > 0 and slot[acc_key] == 0:
            slot[acc_key] = previous_acc
        previous_acc = slot[acc_key]

    return filled
|
||||||
|
|
||||||
|
def get_buzzs(self, buzzs, buzz_type):
    """Project combined per-day entries onto a single buzz type.

    Only LIKE is handled: each entry becomes a date/day/acc dict built from
    its like_day and like_acc fields. Any other buzz_type gives an empty list.
    """
    if buzz_type != LIKE:
        return []

    return [
        self.make_base_buzz_instance([
            entry[BUZZ_KEY[DATE]],
            entry[BUZZ_KEY[LIKE_DAY]],
            entry[BUZZ_KEY[LIKE_ACC]],
        ])
        for entry in buzzs
    ]
|
||||||
|
|
||||||
|
def is_valid_data(self, reply_buzzs, like_buzzs):
    """Return True when both series yield equal date lists.

    NOTE(review): self.get_date_list is not defined in the visible part of
    this class — confirm it exists before enabling callers of this method.
    """
    reply_dates = self.get_date_list(reply_buzzs)
    like_dates = self.get_date_list(like_buzzs)
    return bool(reply_dates == like_dates)
|
||||||
|
|
||||||
|
def summary_reply_and_like(self, reply_buzzs, like_buzzs):
    """Combine the reply and like series into one list of per-day summaries.

    Entries are paired by position and the date is taken from the reply
    entry. The dates of the two series are not compared, and pairing stops
    at the shorter series.
    """
    return [
        self.make_summary_buzz_instance([reply_entry[BUZZ_KEY[DATE]], reply_entry, like_entry])
        for reply_entry, like_entry in zip(reply_buzzs, like_buzzs)
    ]
|
||||||
@@ -37,6 +37,17 @@ class ResultSender:
|
|||||||
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" + " on duplicate key update " + \
|
return query + ",".join(key_list) + ") values (" + ",".join(val_list) + ")" + " on duplicate key update " + \
|
||||||
','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))
|
','.join(map(lambda x:x[0] + '=' + x[1], zip(key_list, val_list)))
|
||||||
|
|
||||||
|
def get_buzz(self, event_num):
    """Return the stored replybuzz value for an event, or None if no row exists.

    Args:
        event_num: value matched against stats_s1_effect.event_num.

    Returns:
        The 'replybuzz' column of the first matching row, or None.
    """
    # The value is passed as a query parameter instead of being concatenated
    # into the SQL text, so it cannot alter the statement.
    # NOTE(review): assumes a DB-API driver with the '%s' paramstyle
    # (e.g. PyMySQL / MySQLdb) and a dict-style cursor — confirm in connect().
    query = 'select replybuzz from stats_s1_effect where event_num = %s'

    if not self.conn.open:
        self.connect()

    with self.conn.cursor() as cursor:
        cursor.execute(query, (event_num,))
        buzz = cursor.fetchone()

    return buzz['replybuzz'] if buzz is not None else None
|
||||||
|
|
||||||
def send(self, table_name, dictionary):
|
def send(self, table_name, dictionary):
|
||||||
query = self._make_query(table_name, dictionary)
|
query = self._make_query(table_name, dictionary)
|
||||||
self._exec_query(query)
|
self._exec_query(query)
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import effect.effectinstagram
|
import effect.effectinstagram
|
||||||
import effect.effecterror
|
import effect.effecterror
|
||||||
import effect.effectkakaostory
|
import effect.effectkakaostory
|
||||||
|
from effect.InstaUrlValidator import InstaUrlValidator
|
||||||
from base.baseclasses import printl
|
from base.baseclasses import printl
|
||||||
import sys
|
import sys
|
||||||
import base.baseclasses
|
import base.baseclasses
|
||||||
@@ -33,9 +34,17 @@ def get_browser_info(platform_, file_name="browser.txt"):
|
|||||||
return options.get(platform_, options['default'])
|
return options.get(platform_, options['default'])
|
||||||
|
|
||||||
|
|
||||||
def get_effect_process(platform_, event_num, url):
|
def get_effect_process(platform_, event_num, url, start_date):
|
||||||
if platform_ == 'instagram':
|
if platform_ == 'instagram':
|
||||||
return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url)
|
try:
|
||||||
|
insta_url_validator = InstaUrlValidator(url)
|
||||||
|
insta_url = insta_url_validator.get_insta_url()
|
||||||
|
except Exception as e:
|
||||||
|
printl("x!@#!@#!@#e010!@#check url")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
# return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url, start_date)
|
||||||
|
return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), insta_url, start_date)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
browser_info = get_browser_info(platform_)
|
browser_info = get_browser_info(platform_)
|
||||||
@@ -52,14 +61,19 @@ if __name__ == '__main__':
|
|||||||
sys.argv[1] instagram, kakaostory, facebook
|
sys.argv[1] instagram, kakaostory, facebook
|
||||||
sys.argv[2] event_num
|
sys.argv[2] event_num
|
||||||
sys.argv[3] url
|
sys.argv[3] url
|
||||||
|
sys.argv[4] start date
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if len(sys.argv) != 4:
|
# if len(sys.argv) != 4:
|
||||||
|
# printl("x!@#!@#!@#e010!@#check argument")
|
||||||
|
# exit(1)
|
||||||
|
|
||||||
|
if len(sys.argv) != 5:
|
||||||
printl("x!@#!@#!@#e010!@#check argument")
|
printl("x!@#!@#!@#e010!@#check argument")
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
effect_process = get_effect_process(sys.argv[1], sys.argv[2], sys.argv[3])
|
effect_process = get_effect_process(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
|
||||||
effect_process.start()
|
effect_process.start()
|
||||||
except effect.effecterror.EffectException as e:
|
except effect.effecterror.EffectException as e:
|
||||||
printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + str(e))
|
printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + str(e))
|
||||||
|
|||||||
91
WebBasedCrawler/facebook/facebookcrawl_new.py
Normal file
91
WebBasedCrawler/facebook/facebookcrawl_new.py
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
import time
|
||||||
|
from selenium.common.exceptions import WebDriverException
|
||||||
|
from base.baseclasses import find_element_by_css_selector
|
||||||
|
from base.baseclasses import find_elements_by_css_selector
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
try:
|
||||||
|
import lxml
|
||||||
|
parser_opt = 'lxml'
|
||||||
|
except ImportError:
|
||||||
|
parser_opt = 'html.parser'
|
||||||
|
|
||||||
|
limit_reload = 5
|
||||||
|
|
||||||
|
list_tag_css_selector = "div#initial_browse_result"
|
||||||
|
list_page_css_selector = "div#pagelet_timeline_main_column"
|
||||||
|
list_group_css_selector = "div#pagelet_group_"
|
||||||
|
each_post_css_selector = "div._4-u2._4-u8"
|
||||||
|
wait_second_for_find_element = 30
|
||||||
|
|
||||||
|
|
||||||
|
class ListBase(object):
    """Base class for stepping through the post elements of a list page.

    Subclasses set ``list_css_selector`` and implement has_next(), get_url(),
    get_date() and get_num_of_list().
    """

    def __init__(self, driver):
        self.driver = driver
        self.url_list = []              # post elements not yet visited
        self.list_css_selector = None   # container selector, set by subclasses
        self.list_container_dom = None
        self.current_post = None        # element currently being processed

    def set_url_elements(self):
        """Append the post elements found under the list container to url_list."""
        # The plural finder is used because the result is added with extend().
        # NOTE(review): assumes find_elements_by_css_selector accepts the same
        # (driver, selector, wait) arguments as the singular helper and
        # returns a list — confirm against base.baseclasses.
        elements = find_elements_by_css_selector(self.driver,
                                                 self.list_css_selector + " " + each_post_css_selector,
                                                 wait_second_for_find_element)
        self.url_list.extend(elements or [])

    def move_first(self):
        """Take the first pending element as current_post (None when empty)."""
        self.current_post = self.url_list.pop(0) if self.url_list else None

    def move_next(self):
        """Advance to the next pending element."""
        self.move_first()

    def check_list_and_load(self):
        """Scroll for more posts while fewer than two are pending.

        Makes at most ``limit_reload`` attempts.

        Raises:
            WebDriverException: if no posts are available after the attempts.
        """
        num_of_list = len(self.url_list)
        for _ in range(limit_reload):
            num_of_list = len(self.url_list)
            if num_of_list >= 2:
                # Enough posts are pending; further passes would do nothing.
                break
            self.load_more_list()
            num_of_list = self.get_num_of_list()
        if not num_of_list:
            raise WebDriverException("There is no data or ajax error")

    def load_more_list(self):
        """Scroll the window up and down to trigger loading of more posts."""
        position = self.driver.get_window_position()
        size = self.driver.get_window_size()
        # Maximise, then restore the earlier size and position.
        self.driver.maximize_window()
        self.driver.set_window_size(size['width'], size["height"])
        self.driver.set_window_position(position['x'], position['y'])
        for _ in range(2):
            self.driver.execute_script("window.scrollBy(0, -400)")
            time.sleep(0.3)
        for _ in range(4):
            self.driver.execute_script("window.scrollBy(0, 800)")
            time.sleep(0.3)

    def has_next(self):
        raise NotImplementedError

    def get_url(self):
        raise NotImplementedError

    def get_date(self):
        raise NotImplementedError

    def remove_current_post(self):
        """Remove the current post's element from the page DOM."""
        # NOTE(review): this uses current_post.id as the DOM id. For a Selenium
        # WebElement, `.id` is the driver's internal reference, not the HTML
        # id attribute — confirm what current_post holds.
        css_selector = "div#" + self.current_post.id
        self.driver.execute_script('document.querySelector("' + css_selector + '").remove()')

    def get_num_of_list(self):
        raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class ListTag(ListBase):
    """Post list whose container is selected by list_tag_css_selector."""

    def __init__(self, driver):
        super().__init__(driver)
        # Override the container selector left as None by the base class.
        self.list_css_selector = list_tag_css_selector
|
||||||
|
|
||||||
|
|
||||||
|
class ListPage(ListBase):
    """Post list whose container is selected by list_page_css_selector."""

    def __init__(self, driver):
        # Run the base initialiser so url_list, list_container_dom and
        # current_post exist; it also stores the driver.
        super().__init__(driver)
        self.list_css_selector = list_page_css_selector
|
||||||
|
|
||||||
197
WebBasedCrawler/facebook/facebookcrawltemp.py
Normal file
197
WebBasedCrawler/facebook/facebookcrawltemp.py
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
#-*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import datetime
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
from selenium.webdriver.common.keys import Keys
|
||||||
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.common.exceptions import WebDriverException
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from base.baseclasses import SendtoDB
|
||||||
|
from base.baseclasses import print_and_flush
|
||||||
|
from base.baseclasses import CrawlInit
|
||||||
|
from base.baseclasses import wait
|
||||||
|
from base.baseclasses import find_element_by_css_selector
|
||||||
|
from base.baseclasses import find_elements_by_css_selector
|
||||||
|
from base.baseclasses import find_elements_by_xpath
|
||||||
|
from base.baseclasses import enter_element
|
||||||
|
from base.baseclasses import Browser
|
||||||
|
|
||||||
|
facebook_url = "http://bigbird.iptime.org/fbtest.php"
|
||||||
|
facebook_tag_url = "https://www.facebook.com/hashtag/"
|
||||||
|
|
||||||
|
facebook_id = 'concepters22@gmail.com'
|
||||||
|
facebook_password = 'zjstpqxjtm'
|
||||||
|
|
||||||
|
|
||||||
|
class FacebookInit(CrawlInit):
    """Crawl settings for Facebook: search terms, target URLs and date range."""

    def __init__(self, before_day=0):
        super().__init__(before_day)
        # Base URL per platform code.
        self.urls = {11: facebook_tag_url, 12: facebook_url}

    def split_searches(self):
        """Split the comma-separated search string into individual terms.

        Terms are stripped for platform 12 and passed through self.utf8()
        for any other platform.
        """
        terms = self.searches().split(',')
        if self.platform() == 12:
            return [term.strip() for term in terms]
        return [self.utf8(term) for term in terms]

    def make_url(self):
        """Return one target URL per search term.

        A purely numeric term is addressed as 'profile.php?id=<term>'.
        Every URL ends with '?fref=ts'.
        """
        urls = []
        for term in self.split_searches():
            if term.isnumeric():
                target = 'profile.php?id=' + term
            else:
                target = term
            urls.append(self.urls[self.platform()] + target + "?fref=ts")
        return urls

    def get_begin_day(self):
        """Return the first day to crawl.

        In realtime mode this is today's midnight plus before_day days;
        otherwise it is the configured start day.
        """
        if not self.is_realtime():
            return self.start_day()
        midnight = datetime.datetime.combine(datetime.date.today(), datetime.time.min)
        return midnight + datetime.timedelta(days=self.before_day)

    def get_end_day(self):
        """Return the last day to crawl: today's midnight in realtime mode,
        otherwise the configured end day."""
        if not self.is_realtime():
            return self.end_day()
        return datetime.datetime.combine(datetime.date.today(), datetime.time.min)

    def is_hashtag(self):
        """Return True for every platform code except 12."""
        return self.platform() != 12
|
||||||
|
|
||||||
|
|
||||||
|
class FacebookMainCrawler:
    """Drive a browser through the configured Facebook URLs and crawl each one."""

    def __init__(self):
        self.crawl_init = FacebookInit()
        self.browser = Browser()
        self.driver = None
        self.keyword_id = None
        self.url = None
        self.db_num = None
        # Set by set_main_window_handler(); initialised so the attribute
        # always exists.
        self.main_window_handler = None

    def set_driver(self, driver):
        self.driver = driver

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def start(self):
        """Entry point: run the crawl loop."""
        self.crawl_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Configure browser, keyword, database number and crawl limits in one call."""
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        """Store the keyword id as an int and load its parameters."""
        self.keyword_id = keyword_id if isinstance(keyword_id, int) else int(keyword_id)
        # The value is forwarded as received, not the converted int.
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.db_num = db_num

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def set_main_window_handler(self, window_handler):
        self.main_window_handler = window_handler

    def crawl_start(self):
        """Crawl every configured URL, repeating while realtime mode is on.

        When a URL fails, the error is logged, the browser is replaced and
        the same URL is tried again (the index only advances on success).
        """
        # NOTE(review): self.page_crawler and self.crawl_all_current_url are
        # used below but are not defined in the visible part of this class —
        # confirm where they come from.
        real_time = True
        while real_time:
            print_and_flush("Crawler Start")
            url_list = self.crawl_init.make_url()
            i = 0
            backup_set = set()
            while i < len(url_list):
                try:
                    self.set_main_window_handler(self.driver.window_handles[0])
                    print_and_flush(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(5)
                    self.facebook_login()
                    body = self.driver.find_element_by_tag_name('body')
                    self.click_element(body)
                    self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
                                               end_date=self.crawl_init.get_end_day())
                    self.crawl_all_current_url(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    logging.info(e)
                    self.driver.quit()
                    self.set_driver(self.browser.new_browser())
                    wait(5)
            real_time = self.crawl_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        self.driver.quit()

    def go_bigbird(self, driver):
        driver.get(facebook_url)

    def click_facebook_login(self, driver):
        """Activate the first link with an href on the page."""
        element_a = find_element_by_css_selector(driver, "a[href]", 15)
        enter_element(element_a)

    def login_facebook(self, driver, f_id, f_pw):
        """Fill in the login form on ``driver`` with the given credentials and submit."""
        element_email = find_element_by_css_selector(driver, "input#email", 15)
        element_password = find_element_by_css_selector(driver, "input#pass", 15)
        element_button = find_element_by_css_selector(driver, "button#loginbutton", 15)
        element_email.send_keys(f_id)
        element_password.send_keys(f_pw)
        enter_element(element_button)

    def facebook_login(self):
        """Log in when the current page shows a login form; otherwise do nothing."""
        try:
            element_email = find_element_by_css_selector(self.driver, '#email', 15)
            element_pwd = find_element_by_css_selector(self.driver, '#pass', 15)
        except Exception:
            # No login form found: nothing to log in to. Unlike a bare
            # `except:`, this lets KeyboardInterrupt and SystemExit through.
            return
        # Uses the module-level credentials instead of a second copy of the
        # same literals.
        # NOTE(review): credentials are hard-coded in this file; consider
        # loading them from configuration.
        element_email.send_keys(facebook_id)
        element_pwd.send_keys(facebook_password)
        label = self.driver.find_element_by_css_selector('#loginbutton')
        element_input = label.find_element_by_xpath('input')
        element_input.send_keys(Keys.NULL)
        element_input.send_keys(Keys.ENTER)
        wait(5)

    def click_element(self, element):
        """Move the pointer to ``element``, click it and wait."""
        ac = ActionChains(self.driver)
        ac.move_to_element(element).click().perform()
        wait(4)
|
||||||
|
|
||||||
@@ -314,7 +314,6 @@ def crawl_content_process(qu, keyword_id, db_num):
|
|||||||
break
|
break
|
||||||
ok = True
|
ok = True
|
||||||
while ok:
|
while ok:
|
||||||
time.sleep(2)
|
|
||||||
try:
|
try:
|
||||||
# get a instance of InstaContent by do_no_proxy func.
|
# get a instance of InstaContent by do_no_proxy func.
|
||||||
# if element['url'] is invalid, content is None
|
# if element['url'] is invalid, content is None
|
||||||
|
|||||||
@@ -103,10 +103,10 @@ def parse_body_html(content):
|
|||||||
start_cursor = None
|
start_cursor = None
|
||||||
has_previous = False
|
has_previous = False
|
||||||
if postpage:
|
if postpage:
|
||||||
media = postpage[0]["media"]
|
media = postpage[0]["graphql"]["shortcode_media"]
|
||||||
body = {
|
body = {
|
||||||
"article_date": (old_date + datetime.timedelta(seconds=media["date"])).strftime("%Y-%m-%d %H:%M:%S"),
|
"article_date": (old_date + datetime.timedelta(seconds=media["taken_at_timestamp"])).strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
"article_data": media["caption"],
|
"article_data": media["edge_media_to_caption"]["edges"][0]["node"]["text"],
|
||||||
"article_id": media["owner"]["username"],
|
"article_id": media["owner"]["username"],
|
||||||
"article_nickname": media["owner"]["username"],
|
"article_nickname": media["owner"]["username"],
|
||||||
"platform_id": media["owner"]["username"],
|
"platform_id": media["owner"]["username"],
|
||||||
@@ -115,22 +115,22 @@ def parse_body_html(content):
|
|||||||
"platform_title": media["owner"]["username"],
|
"platform_title": media["owner"]["username"],
|
||||||
"article_form": "body",
|
"article_form": "body",
|
||||||
"article_profileurl": media["owner"]["profile_pic_url"],
|
"article_profileurl": media["owner"]["profile_pic_url"],
|
||||||
"article_order": str(media["comments"]["count"]),
|
"article_order": str(media["edge_media_to_comment"]["count"]),
|
||||||
"article_hit": str(media.get('video_views', 0)),
|
"article_hit": str(0),
|
||||||
"reply_url": str(media["likes"]["count"])
|
"reply_url": str(media["edge_media_preview_like"]["count"])
|
||||||
}
|
}
|
||||||
comments = postpage[0]["media"]["comments"]
|
comments = postpage[0]["graphql"]["shortcode_media"]["edge_media_to_comment"]
|
||||||
has_previous = comments["page_info"]["has_previous_page"]
|
has_previous = comments["page_info"]["has_next_page"]
|
||||||
start_cursor = comments["page_info"]["start_cursor"]
|
start_cursor = comments["page_info"]["end_cursor"]
|
||||||
nodes = comments["nodes"]
|
nodes = comments["edges"]
|
||||||
for node in nodes:
|
for node in nodes:
|
||||||
reply.append({
|
reply.append({
|
||||||
"article_data": node["text"],
|
"article_data": node["node"]["text"],
|
||||||
"article_date":
|
"article_date":
|
||||||
(old_date + datetime.timedelta(seconds=node["created_at"])).strftime("%Y-%m-%d %H:%M:%S"),
|
(old_date + datetime.timedelta(seconds=node["node"]["created_at"])).strftime("%Y-%m-%d %H:%M:%S"),
|
||||||
"article_id": node["user"]["username"],
|
"article_id": node["node"]["owner"]["username"],
|
||||||
"article_nickname": node["user"]["username"],
|
"article_nickname": node["node"]["owner"]["username"],
|
||||||
"article_profileurl": node["user"]["profile_pic_url"],
|
"article_profileurl": node["node"]["owner"]["profile_pic_url"],
|
||||||
"platform_name": "instagram",
|
"platform_name": "instagram",
|
||||||
"platform_form": "post",
|
"platform_form": "post",
|
||||||
"article_form": "reply"
|
"article_form": "reply"
|
||||||
|
|||||||
Reference in New Issue
Block a user