4 space -> tab으로 수정

This commit is contained in:
mjjo
2017-08-10 12:44:03 +09:00
parent 16a9afbd9f
commit cba76dbe59
7 changed files with 275 additions and 275 deletions

View File

@@ -3,81 +3,81 @@ import datetime
from numbers import Number from numbers import Number
class DataDBRow:
    """One row destined for a ``data_<db_num>`` table.

    Every attribute assigned in ``__init__`` is used as a column name when the
    key/value tuples and the INSERT statement are built. The column list is
    always taken from a fresh ``DataDBRow`` instance, so attributes added by
    subclasses are never emitted as columns.
    """

    def __init__(self):
        self.platform_name = None
        self.platform_form = None
        self.platform_title = None
        self.article_form = None
        self.article_parent = None
        self.article_id = None
        self.article_nickname = None
        self.article_title = None
        self.article_data = None
        self.article_url = None
        self.article_hit = 0
        self.article_date = None
        self.article_order = 0
        self.article_profile = None
        self.article_profileurl = None
        self.platform_id = None
        self.keyword_id = -1
        self.reply_url = None
        self.etc = None

    @staticmethod
    def _column_names():
        """Return the column names in the order ``__init__`` defines them."""
        return [
            key for key, value in vars(DataDBRow()).items()
            if not (key.startswith('__') or callable(value))
        ]

    @staticmethod
    def _to_sql_literal(conn, value):
        """Render ``value`` as SQL literal text.

        Numbers are written verbatim; everything else goes through
        ``conn.escape``.
        """
        if isinstance(value, Number):
            return str(value)
        if isinstance(value, str):
            # The encode/decode round trip raises early on text that cannot be
            # represented as UTF-8 (e.g. lone surrogates).
            return conn.escape(value.encode('utf8').decode('utf8'))
        return conn.escape(value)

    def get_keys(self):
        """Return the column names as a tuple."""
        return tuple(self._column_names())

    def get_values(self, conn, db_num):
        """Return this row's column values as a tuple of SQL literal strings.

        :param conn: object providing ``escape(value)``.
        :param db_num: unused; kept for interface compatibility.
        """
        return tuple(
            self._to_sql_literal(conn, self.__dict__[key])
            for key in self._column_names()
        )

    def get_delete_query(self, db_num, conn=None):
        """Build the DELETE statement removing rows that share ``article_url``.

        :param db_num: suffix of the target ``data_<db_num>`` table.
        :param conn: optional object providing ``escape(value)``. When given,
            it is used to quote the URL. Without it, backslashes and single
            quotes are escaped by hand so that a quote inside the URL can no
            longer terminate the string literal.
        """
        url = str(self.article_url)
        if conn is not None:
            url_literal = conn.escape(url)
        else:
            escaped = url.replace('\\', '\\\\').replace("'", "\\'")
            url_literal = "'{}'".format(escaped)
        return 'delete from data_{} where article_url={}'.format(db_num, url_literal)

    def get_insert_query(self, conn, db_num):
        """Build the INSERT statement for this row.

        :param conn: object providing ``escape(value)``.
        :param db_num: suffix of the target ``data_<db_num>`` table.
        """
        columns = self._column_names()
        keys = ', '.join(columns)
        values = ', '.join(
            self._to_sql_literal(conn, self.__dict__[key]) for key in columns
        )
        return 'insert into data_{} ({}) values ({})'.format(db_num, keys, values)

View File

@@ -4,10 +4,10 @@ import base.logger as logger
def print_exception(obj=None):
    """Log the exception currently being handled, with its source line.

    The file name and line number are taken from the first traceback entry
    returned by ``sys.exc_info()``, and the matching source line is looked up
    through ``linecache``.

    :param obj: optional extra context appended to the log message when truthy.
    """
    exc_type, exc_obj, tb = sys.exc_info()
    if tb is None:
        # No exception is being handled (called outside an ``except`` block);
        # there is no frame to inspect, so only the context object is logged.
        logger.log('(no active exception): {}'.format(obj if obj else ''), logger.LogLevel.ERROR)
        return

    frame = tb.tb_frame
    lineno = tb.tb_lineno
    filename = frame.f_code.co_filename
    # Refresh the cache in case the file changed on disk since it was loaded.
    linecache.checkcache(filename)
    line = linecache.getline(filename, lineno, frame.f_globals)
    logger.log('({}({}) Exception from "{}"):\n {}, {}'.format(filename, lineno, line.strip(), exc_obj, obj if obj else ''), logger.LogLevel.ERROR)

View File

@@ -223,8 +223,8 @@ class Proxy2Handler:
self.lock.acquire() self.lock.acquire()
for proxy in proxies: for proxy in proxies:
query = r"INSERT INTO proxy2(ip, PORT) " \ query = r"INSERT INTO proxy2(ip, PORT) " \
r"SELECT '{}', {} FROM DUAL " \ r"SELECT '{}', {} FROM DUAL " \
r"WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip='{}' AND PORT={})"\ r"WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip='{}' AND PORT={})"\
.format(proxy['ip'], proxy['port'], proxy['ip'], proxy['port']) .format(proxy['ip'], proxy['port'], proxy['ip'], proxy['port'])
# 안됨 - 중복으로 들어감, 쓰레드 종료됨 # 안됨 - 중복으로 들어감, 쓰레드 종료됨
self.engine.execute(query) self.engine.execute(query)

View File

@@ -2,69 +2,69 @@ import datetime
import copy import copy
class TwitterConfig:
    """Search parameters for one Twitter crawl job.

    Class attributes hold the URL pieces used to build requests; instance
    attributes hold the per-job settings filled in by ``set_param``.
    """

    protocol = 'https'
    top_url = 'twitter.com'
    search_url = '/i/search/timeline'
    conversation_url_form = '/i/{}/conversation/{}'

    _DATE_FORMAT = '%Y-%m-%d'

    def __init__(self):
        self.keyword_id = -1
        self.db_num = -1
        self.id = 0
        self.realtime = False
        self.keywords = []
        self.start_str = None
        self.start = None
        self.end_str = None
        self.end = None
        self.authorship = None
        self.state = None
        self.platform = None

    def set_param(self, keyword_id, db_num, params):
        """Populate the configuration from a parameter mapping.

        :param keyword_id: keyword identifier, converted with ``int``.
        :param db_num: target database number, converted with ``int``.
        :param params: mapping with the keys ``id``, ``realtime``,
            ``searches`` (comma-separated keywords), ``start`` and ``end``
            (``YYYY-MM-DD``), ``authorship``, ``state`` and ``platform``.
        :raises ValueError: if ``start`` or ``end`` is not ``YYYY-MM-DD``.
        """
        self.keyword_id = int(keyword_id)
        self.db_num = int(db_num)
        self.id = int(params['id'])
        self.realtime = params['realtime'] == 1

        # Drop blank entries produced by stray or trailing commas so that an
        # empty keyword is never searched.
        stripped = (keyword.strip() for keyword in params['searches'].split(','))
        self.keywords = [keyword for keyword in stripped if keyword]

        self.start_str = str(params['start'])
        self.end_str = str(params['end'])
        self.start = datetime.datetime.strptime(self.start_str, self._DATE_FORMAT)
        self.end = datetime.datetime.strptime(self.end_str, self._DATE_FORMAT)
        self.authorship = params['authorship']
        self.state = params['state']
        self.platform = params['platform']

    def reload_realtime(self, before_day):
        """Re-anchor the date range on today for realtime jobs.

        Does nothing unless ``realtime`` is set. ``end`` becomes today at
        midnight (local time) and ``start`` becomes ``end`` plus
        ``before_day`` days, so a negative ``before_day`` moves the start
        into the past.

        :param before_day: day offset added to ``end``, converted with ``int``.
        """
        if not self.realtime:
            return

        self.end = datetime.datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        self.end_str = self.end.strftime(self._DATE_FORMAT)
        self.start = self.end + datetime.timedelta(days=int(before_day))
        self.start_str = self.start.strftime(self._DATE_FORMAT)

    def split(self):
        """Split the date range into one-day configurations, newest first.

        :return: list of deep copies of this configuration, each covering a
            single day; empty when ``end`` is not later than ``start``.
        """
        one_day = datetime.timedelta(days=1)
        split_list = []
        day_end = self.end
        while day_end > self.start:
            day_start = day_end - one_day
            new_config = copy.deepcopy(self)
            new_config.start = day_start
            new_config.end = day_end
            new_config.start_str = day_start.strftime(self._DATE_FORMAT)
            new_config.end_str = day_end.strftime(self._DATE_FORMAT)
            split_list.append(new_config)
            day_end = day_start
        return split_list

View File

@@ -3,81 +3,81 @@ import queue
class TwitterDBHelper:
    """Queue tweets in memory and write them to MySQL in batches."""

    pymysql = __import__('pymysql.cursors')
    DB_DUMP_SIZE = 128

    # NOTE(review): connection settings, including the password, are
    # hard-coded here exactly as before; they should come from configuration.
    DB_HOST = 'bigbird.iptime.org'
    DB_USER = 'admin'
    DB_PASSWD = 'admin123'
    DB_NAME = 'concepters'
    DB_CHARSET = 'utf8'

    # Seconds to wait between connection attempts in ``flush``.
    CONNECT_RETRY_DELAY = 1.0

    def __init__(self):
        self.tweets = []
        self.buffer = []
        self.queue = queue.Queue()

    def __del__(self):
        # Write out whatever is still queued when the helper is collected.
        self.flush()

    def _connect(self, **extra):
        """Open a connection with the shared settings plus ``extra`` options."""
        return self.pymysql.connect(host=self.DB_HOST,
                                    user=self.DB_USER, passwd=self.DB_PASSWD,
                                    db=self.DB_NAME, charset=self.DB_CHARSET,
                                    cursorclass=self.pymysql.cursors.DictCursor,
                                    **extra)

    def get_param(self, keyword_id):
        """Fetch the ``keyword`` row whose id is ``keyword_id``.

        Terminates the process with exit status 1 when the database cannot be
        queried.

        :return: the row as returned by ``cursor.fetchone()``.
        """
        params = []
        try:
            conn = self._connect()
            try:
                with conn.cursor() as cursor:
                    # Let the driver quote the id instead of concatenating it.
                    cursor.execute("select * from keyword where id = %s", (keyword_id,))
                    params = cursor.fetchone()
            finally:
                # Close the connection even when the query itself fails.
                conn.close()
        except Exception as e:
            print(e)
            raise SystemExit(1)

        return params

    def flush(self):
        """Write every queued tweet to the database in one transaction.

        For each non-reply tweet the existing rows with the same article URL
        are deleted before the tweet is inserted. Connection attempts are
        repeated until one succeeds. If executing the batch fails, the error
        is printed and the drained tweets are not re-queued.
        """
        import time

        local_buffer = []
        while True:
            try:
                # Non-blocking get: never hang if another consumer empties the
                # queue between the emptiness check and the read.
                local_buffer.append(self.queue.get_nowait())
            except queue.Empty:
                break

        print('### db queue dump {}'.format(len(local_buffer)))

        if not local_buffer:
            return

        while True:
            try:
                conn = self._connect(connect_timeout=5)
            except Exception as e:
                print(e)
                # Back off instead of spinning while the server is unreachable.
                time.sleep(self.CONNECT_RETRY_DELAY)
            else:
                break

        try:
            with conn.cursor() as cursor:
                for tweet, _db_num in local_buffer:
                    if not tweet.is_reply:
                        cursor.execute(tweet.get_delete_query(_db_num))
                    cursor.execute(tweet.get_insert_query(conn, _db_num))
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            conn.close()

    def insert_tweet(self, tweet: 'Tweet' = None, db_num: int = -1, flush=False):
        """Queue ``tweet`` for insertion into ``data_<db_num>``.

        The queue is written out when it reaches ``DB_DUMP_SIZE`` entries or
        when ``flush`` is true.
        """
        self.queue.put((tweet, db_num))
        if flush or self.queue.qsize() >= self.DB_DUMP_SIZE:
            self.flush()

View File

@@ -3,22 +3,22 @@ from base.dbdata import DataDBRow
class Tweet(DataDBRow):
    """A parsed tweet: the ``DataDBRow`` columns plus Twitter-specific fields."""

    def __init__(self):
        # Zero-argument super(): ``super(self.__class__, self)`` would recurse
        # forever as soon as this class is subclassed.
        super().__init__()

        self.tweet_id = None
        self.user_id = None
        self.user_name = None
        self.text = None
        self.created_at = None
        self.retweets = 0
        self.favorites = 0

        self.is_reply = False
        self.reply_cnt = 0
        self.retweet_cnt = 0
        self.favorite_cnt = 0
        self.top_link = None
        self.tweet_link = None

        self.depth = 0

View File

@@ -7,90 +7,90 @@ import pytz
class TweetParser:
    """Static helpers that turn scraped Twitter HTML tags into ``Tweet`` objects."""

    @staticmethod
    def _stat_count(tag, action):
        """Return the ``action`` counter found under ``tag``, or ``None`` if absent."""
        found = tag.select('span.ProfileTweet-action--{} > span.ProfileTweet-actionCount'.format(action))
        if found:
            return int(found[0].attrs['data-tweet-stat-count'])
        return None

    @staticmethod
    def parse(tag, keyword_id, depth=0, top_tw: 'Tweet' = None):
        """Build a ``Tweet`` from one tweet tag.

        :param tag: parsed HTML element of the tweet (must support ``select``
            and ``attrs``).
        :param keyword_id: stored on the tweet as ``keyword_id``.
        :param depth: stored as ``depth`` / ``article_order``; ``0`` marks the
            tweet as a body, anything else as a reply.
        :param top_tw: the top tweet of the conversation, if any; its link and
            user id are used for ``top_link``, ``platform_title`` and
            ``platform_id``.
        :return: the populated ``Tweet``.
        """
        tweet = Tweet()

        tweet.tweet_id = int(tag.attrs['data-tweet-id'])

        # The display name is the text directly inside the tag; nested
        # elements are skipped and the text pieces are joined with spaces.
        nickname_tag = tag.select('strong.fullname')[0]
        tweet.user_name = ''
        for child in nickname_tag.children:
            if isinstance(child, bs4.element.NavigableString):
                if len(tweet.user_name) > 0:
                    tweet.user_name += ' '
                tweet.user_name += child
        tweet.user_id = tag.select('span.username')[0].text[1:]
        tweet.text = tag.select('p.tweet-text')[0].text

        # The epoch value in ``data-time`` is used rather than the timestamp
        # title text, whose format depends on the page language.
        timestamp = int(tag.select('span._timestamp')[0].attrs['data-time'])
        local_tz = pytz.timezone('Asia/Seoul')
        utc_dt = datetime.datetime.fromtimestamp(timestamp, tz=pytz.utc)
        tweet.created_at = local_tz.normalize(utc_dt.astimezone(local_tz))

        reply_tag = tag.select('div.ReplyingToContextBelowAuthor')
        tweet.is_reply = len(reply_tag) > 0

        reply_cnt = TweetParser._stat_count(tag, 'reply')
        if reply_cnt is not None:
            tweet.reply_cnt = reply_cnt

        retweet_cnt = TweetParser._stat_count(tag, 'retweet')
        if retweet_cnt is not None:
            tweet.retweet_cnt = retweet_cnt

        favorite_cnt = TweetParser._stat_count(tag, 'favorite')
        if favorite_cnt is not None:
            # ``favorite_cnt`` is the attribute Tweet declares; the previously
            # written ``favorites_cnt`` is still set for existing readers.
            tweet.favorite_cnt = favorite_cnt
            tweet.favorites_cnt = favorite_cnt

        link_tag = tag.select('a.js-permalink')
        if len(link_tag) > 0:
            tweet.tweet_link = TwitterConfig.protocol + '://' + TwitterConfig.top_url + link_tag[0].attrs['href']
        tweet.top_link = top_tw.tweet_link if top_tw else tweet.tweet_link

        tweet.depth = depth

        # Generic row columns. article_parent, article_title, article_hit,
        # article_profile and etc keep their defaults.
        tweet.platform_name = 'twitter'
        tweet.platform_form = 'post'
        tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id
        tweet.article_form = 'body' if tweet.depth == 0 else 'reply'
        tweet.article_id = tweet.user_id
        tweet.article_nickname = tweet.user_name
        tweet.article_data = tweet.text
        tweet.article_url = tweet.top_link
        tweet.article_date = tweet.created_at
        tweet.article_order = tweet.depth
        tweet.article_profileurl = TwitterConfig.protocol + '://' + TwitterConfig.top_url + '/' + tweet.user_id
        tweet.platform_id = top_tw.user_id if top_tw else tweet.user_id
        tweet.keyword_id = keyword_id
        tweet.reply_url = tweet.tweet_link

        return tweet

    @staticmethod
    def get_lone_container(soup, parent_tw):
        """Return the lone-tweet containers that come after ``parent_tw``.

        The containers are scanned from the last one backwards, stopping at the
        one whose item id equals ``parent_tw.tweet_id``; that container and
        everything before it are left out. If no container matches, all of
        them are returned.

        :return: an iterator over the selected containers in page order.
        """
        lone_tweets = soup.select('div.ThreadedConversation--loneTweet')
        container_tags = []
        for tag in reversed(lone_tweets):
            li = tag.select('li.stream-item')
            if len(li) > 0 and 'data-item-id' in li[0].attrs:
                tweet_id = int(li[0].attrs['data-item-id'])
                if tweet_id == parent_tw.tweet_id:
                    break
            container_tags.append(tag)

        return reversed(container_tags)