From cba76dbe59f37ce541cc22e347d4a5b5ca16a038 Mon Sep 17 00:00:00 2001
From: mjjo
Date: Thu, 10 Aug 2017 12:44:03 +0900
Subject: [PATCH] Convert 4-space indentation to tabs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 WebBasedCrawler/base/dbdata.py        | 130 +++++++++++------------
 WebBasedCrawler/base/debug.py         |  14 +--
 WebBasedCrawler/base/proxy2.py        |   4 +-
 WebBasedCrawler/twitter/twconfig.py   | 102 +++++++++---------
 WebBasedCrawler/twitter/twdbhelper.py | 124 +++++++++++-----------
 WebBasedCrawler/twitter/tweet.py      |  32 +++---
 WebBasedCrawler/twitter/twparser.py   | 144 +++++++++++++-------------
 7 files changed, 275 insertions(+), 275 deletions(-)

diff --git a/WebBasedCrawler/base/dbdata.py b/WebBasedCrawler/base/dbdata.py
index e4c2416..01b9e8f 100644
--- a/WebBasedCrawler/base/dbdata.py
+++ b/WebBasedCrawler/base/dbdata.py
@@ -3,81 +3,81 @@ import datetime
 from numbers import Number
 
 
 class DataDBRow:
-    def __init__(self):
-        self.platform_name = None
-        self.platform_form = None
-        self.platform_title = None
-        self.article_form = None
-        self.article_parent = None
-        self.article_id = None
-        self.article_nickname = None
-        self.article_title = None
-        self.article_data = None
-        self.article_url = None
-        self.article_hit = 0
-        self.article_date = None
-        self.article_order = 0
-        self.article_profile = None
-        self.article_profileurl = None
-        self.platform_id = None
-        self.keyword_id = -1
-        self.reply_url = None
-        self.etc = None
+	def __init__(self):
+		self.platform_name = None
+		self.platform_form = None
+		self.platform_title = None
+		self.article_form = None
+		self.article_parent = None
+		self.article_id = None
+		self.article_nickname = None
+		self.article_title = None
+		self.article_data = None
+		self.article_url = None
+		self.article_hit = 0
+		self.article_date = None
+		self.article_order = 0
+		self.article_profile = None
+		self.article_profileurl = None
+		self.platform_id = None
+		self.keyword_id = -1
+		self.reply_url = None
+		self.etc = None
 
-    def get_keys(self):
-        inst = DataDBRow()
-        keys = ()
-        for key, value_type in inst.__dict__.items():
-            if key.startswith('__') or callable(value_type):
-                continue
+	def get_keys(self):
+		inst = DataDBRow()
+		keys = ()
+		for key, value_type in inst.__dict__.items():
+			if key.startswith('__') or callable(value_type):
+				continue
 
-            keys += key,
+			keys += key,
 
-        return keys
+		return keys
 
-    def get_values(self, conn, db_num):
-        inst = DataDBRow()
-        values = ()
-        for key, value_type in inst.__dict__.items():
-            if key.startswith('__') or callable(value_type):
-                continue
+	def get_values(self, conn, db_num):
+		inst = DataDBRow()
+		values = ()
+		for key, value_type in inst.__dict__.items():
+			if key.startswith('__') or callable(value_type):
+				continue
 
-            value = self.__dict__[key]
-            if isinstance(value, Number):
-                values += str(value),
-            elif isinstance(value, str):
-                values += conn.escape(value.encode('utf8').decode('utf8')),
-            else:
-                values += conn.escape(value),
+			value = self.__dict__[key]
+			if isinstance(value, Number):
+				values += str(value),
+			elif isinstance(value, str):
+				values += conn.escape(value.encode('utf8').decode('utf8')),
+			else:
+				values += conn.escape(value),
 
-        return values
+		return values
 
-    def get_delete_query(self, db_num):
-        query = """delete from data_{} where article_url='{}'""".format(db_num, self.article_url)
-        return query
+	def get_delete_query(self, db_num):
+		query = """delete from data_{} where article_url='{}'""".format(db_num, self.article_url)
+		return query
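Note on the queries above: get_delete_query() interpolates article_url into the SQL string with no escaping, while get_insert_query() goes through conn.escape(). A minimal sketch of the same two statements using pymysql parameter binding instead; delete_row/insert_row are illustrative names, not functions from this patch, and the table name still has to be formatted in because placeholders cannot bind identifiers:

    def delete_row(conn, db_num, row):
        # row is a DataDBRow; the URL travels as a bound parameter, not as text
        query = 'delete from data_{} where article_url=%s'.format(db_num)
        with conn.cursor() as cursor:
            cursor.execute(query, (row.article_url,))

    def insert_row(conn, db_num, row):
        keys = row.get_keys()
        placeholders = ', '.join(['%s'] * len(keys))
        query = 'insert into data_{} ({}) values ({})'.format(
            db_num, ', '.join(keys), placeholders)
        with conn.cursor() as cursor:
            cursor.execute(query, tuple(row.__dict__[key] for key in keys))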
diff --git a/WebBasedCrawler/base/debug.py b/WebBasedCrawler/base/debug.py
index da708ab..4bcb302 100644
--- a/WebBasedCrawler/base/debug.py
+++ b/WebBasedCrawler/base/debug.py
@@ -4,10 +4,10 @@ import base.logger as logger
 
 
 def print_exception(obj=None):
-    exc_type, exc_obj, tb = sys.exc_info()
-    f = tb.tb_frame
-    lineno = tb.tb_lineno
-    filename = f.f_code.co_filename
-    linecache.checkcache(filename)
-    line = linecache.getline(filename, lineno, f.f_globals)
-    logger.log('({}({}) Exception from "{}"):\n {}, {}'.format(filename, lineno, line.strip(), exc_obj, obj if obj else ''), logger.LogLevel.ERROR)
+	exc_type, exc_obj, tb = sys.exc_info()
+	f = tb.tb_frame
+	lineno = tb.tb_lineno
+	filename = f.f_code.co_filename
+	linecache.checkcache(filename)
+	line = linecache.getline(filename, lineno, f.f_globals)
+	logger.log('({}({}) Exception from "{}"):\n {}, {}'.format(filename, lineno, line.strip(), exc_obj, obj if obj else ''), logger.LogLevel.ERROR)
diff --git a/WebBasedCrawler/base/proxy2.py b/WebBasedCrawler/base/proxy2.py
index 0d95faf..adb61e3 100644
--- a/WebBasedCrawler/base/proxy2.py
+++ b/WebBasedCrawler/base/proxy2.py
@@ -223,8 +223,8 @@ class Proxy2Handler:
         self.lock.acquire()
         for proxy in proxies:
             query = r"INSERT INTO proxy2(ip, PORT) " \
-                    r"SELECT '{}', {} FROM DUAL " \
-                    r"WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip='{}' AND PORT={})"\
+					r"SELECT '{}', {} FROM DUAL " \
+					r"WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip='{}' AND PORT={})"\
                 .format(proxy['ip'], proxy['port'], proxy['ip'], proxy['port'])  # not working - duplicates still get inserted, thread dies
             self.engine.execute(query)
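The in-file comment above (translated from Korean) records that the INSERT ... WHERE NOT EXISTS pattern still lets duplicates through under concurrent writers. A sketch of one common alternative, assuming a MySQL backend and the SQLAlchemy-style engine.execute() already used in this class: declare the (ip, PORT) pair unique once, then let the server reject duplicates atomically.

    # one-time schema change (assumption: no unique key exists yet)
    engine.execute('ALTER TABLE proxy2 ADD UNIQUE KEY uniq_ip_port (ip, PORT)')

    # INSERT IGNORE silently skips rows that would violate the unique key,
    # so concurrent writers cannot create duplicates
    for proxy in proxies:
        engine.execute(
            "INSERT IGNORE INTO proxy2 (ip, PORT) VALUES ('{}', {})".format(
                proxy['ip'], proxy['port']))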
diff --git a/WebBasedCrawler/twitter/twconfig.py b/WebBasedCrawler/twitter/twconfig.py
index 4208ef7..3884193 100644
--- a/WebBasedCrawler/twitter/twconfig.py
+++ b/WebBasedCrawler/twitter/twconfig.py
@@ -2,69 +2,69 @@ import datetime
 import copy
 
 
 class TwitterConfig:
-    protocol = 'https'
-    top_url = 'twitter.com'
-    search_url = '/i/search/timeline'
-    conversation_url_form = '/i/{}/conversation/{}'
+	protocol = 'https'
+	top_url = 'twitter.com'
+	search_url = '/i/search/timeline'
+	conversation_url_form = '/i/{}/conversation/{}'
 
-    def __init__(self):
-        self.keyword_id = -1
-        self.db_num = -1
+	def __init__(self):
+		self.keyword_id = -1
+		self.db_num = -1
 
-        self.id = 0
-        self.realtime = False
-        self.keywords = []
-        self.start_str = None
-        self.start = None
-        self.end_str = None
-        self.end = None
-        self.authorship = None
-        self.state = None
-        self.platform = None
+		self.id = 0
+		self.realtime = False
+		self.keywords = []
+		self.start_str = None
+		self.start = None
+		self.end_str = None
+		self.end = None
+		self.authorship = None
+		self.state = None
+		self.platform = None
 
-    def set_param(self, keyword_id, db_num, params):
-        self.keyword_id = int(keyword_id)
-        self.db_num = int(db_num)
+	def set_param(self, keyword_id, db_num, params):
+		self.keyword_id = int(keyword_id)
+		self.db_num = int(db_num)
 
-        self.id = int(params['id'])
-        self.realtime = params['realtime'] == 1
+		self.id = int(params['id'])
+		self.realtime = params['realtime'] == 1
 
-        self.keywords = []
-        for keyword in params['searches'].split(','):
-            self.keywords.append(keyword.strip())
+		self.keywords = []
+		for keyword in params['searches'].split(','):
+			self.keywords.append(keyword.strip())
 
-        self.start_str = str(params['start'])
-        self.end_str = str(params['end'])
-        self.start = datetime.datetime.strptime(self.start_str, '%Y-%m-%d')
-        self.end = datetime.datetime.strptime(self.end_str, '%Y-%m-%d')
+		self.start_str = str(params['start'])
+		self.end_str = str(params['end'])
+		self.start = datetime.datetime.strptime(self.start_str, '%Y-%m-%d')
+		self.end = datetime.datetime.strptime(self.end_str, '%Y-%m-%d')
 
-        self.authorship = params['authorship']
-        self.state = params['state']
-        self.platform = params['platform']
+		self.authorship = params['authorship']
+		self.state = params['state']
+		self.platform = params['platform']
 
-    def reload_realtime(self, before_day):
-        if not self.realtime:
-            return
+	def reload_realtime(self, before_day):
+		if not self.realtime:
+			return
 
-        self.end_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d')
-        self.end = datetime.datetime.strptime(self.end_str, '%Y-%m-%d')
-        self.start = self.end + datetime.timedelta(days=int(before_day))
-        self.start_str = datetime.datetime.strftime(self.start, '%Y-%m-%d')
+		self.end_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d')
+		self.end = datetime.datetime.strptime(self.end_str, '%Y-%m-%d')
+		self.start = self.end + datetime.timedelta(days=int(before_day))
+		self.start_str = datetime.datetime.strftime(self.start, '%Y-%m-%d')
 
-    def split(self):
-        split_list = []
-        new_end = self.end
+	def split(self):
+		split_list = []
+		new_end = self.end
 
-        while new_end > self.start:
-            new_config = copy.deepcopy(self)
+		while new_end > self.start:
+			new_config = copy.deepcopy(self)
 
-            new_config.end = new_end
-            new_end = new_end + datetime.timedelta(days=-1)
-            new_config.start = new_end
+			new_config.end = new_end
+			new_end = new_end + datetime.timedelta(days=-1)
+			new_config.start = new_end
 
-            new_config.start_str = new_config.start.strftime('%Y-%m-%d')
-            new_config.end_str = new_config.end.strftime('%Y-%m-%d')
+			new_config.start_str = new_config.start.strftime('%Y-%m-%d')
+			new_config.end_str = new_config.end.strftime('%Y-%m-%d')
 
-            split_list.append(new_config)
+			split_list.append(new_config)
 
-        return split_list
+		return split_list
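For reference, what split() yields: it walks backward from end one day at a time, so a multi-day window becomes a list of single-day configs, newest first. A quick sketch with hypothetical parameter values:

    config = TwitterConfig()
    config.set_param(keyword_id=1, db_num=1, params={
        'id': 1, 'realtime': 0, 'searches': 'keyword1, keyword2',
        'start': '2017-08-07', 'end': '2017-08-10',
        'authorship': None, 'state': None, 'platform': None,
    })

    for day in config.split():
        print(day.start_str, '->', day.end_str)
    # 2017-08-09 -> 2017-08-10
    # 2017-08-08 -> 2017-08-09
    # 2017-08-07 -> 2017-08-08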
diff --git a/WebBasedCrawler/twitter/twdbhelper.py b/WebBasedCrawler/twitter/twdbhelper.py
index e9a3fb9..91ebb0c 100644
--- a/WebBasedCrawler/twitter/twdbhelper.py
+++ b/WebBasedCrawler/twitter/twdbhelper.py
@@ -3,81 +3,81 @@ import queue
 
 
 class TwitterDBHelper:
-    pymysql = __import__('pymysql.cursors')
-    DB_DUMP_SIZE = 128
+	pymysql = __import__('pymysql.cursors')
+	DB_DUMP_SIZE = 128
 
-    def __init__(self):
-        self.tweets = []
-        self.buffer = []
-        self.queue = queue.Queue()
-        pass
+	def __init__(self):
+		self.tweets = []
+		self.buffer = []
+		self.queue = queue.Queue()
+		pass
 
-    def __del__(self):
-        self.flush()
-        pass
+	def __del__(self):
+		self.flush()
+		pass
 
-    def get_param(self, keyword_id):
-        query = "select * from keyword where id = " + str(keyword_id)
-        params = []
-        try:
-            conn = self.pymysql.connect(host='bigbird.iptime.org',
-                                        user='admin', passwd='admin123',
-                                        db='concepters', charset='utf8',
-                                        cursorclass=self.pymysql.cursors.DictCursor)
+	def get_param(self, keyword_id):
+		query = "select * from keyword where id = " + str(keyword_id)
+		params = []
+		try:
+			conn = self.pymysql.connect(host='bigbird.iptime.org',
+										user='admin', passwd='admin123',
+										db='concepters', charset='utf8',
+										cursorclass=self.pymysql.cursors.DictCursor)
 
-            with conn.cursor() as cursor:
-                cursor.execute(query)
-                params = cursor.fetchone()
+			with conn.cursor() as cursor:
+				cursor.execute(query)
+				params = cursor.fetchone()
 
-        except Exception as e:
-            print(e)
-            exit(1)
+		except Exception as e:
+			print(e)
+			exit(1)
 
-        else:
-            conn.close()
+		else:
+			conn.close()
 
-        return params
+		return params
 
-    def flush(self):
-        local_buffer = []
-        while not self.queue.empty():
-            local_buffer.append(self.queue.get())
+	def flush(self):
+		local_buffer = []
+		while not self.queue.empty():
+			local_buffer.append(self.queue.get())
 
-        print('### db queue dump {}'.format(len(local_buffer)))
+		print('### db queue dump {}'.format(len(local_buffer)))
 
-        if len(local_buffer) > 0:
-            while True:
-                try:
-                    conn = self.pymysql.connect(host='bigbird.iptime.org',
-                                                user='admin', passwd='admin123',
-                                                db='concepters', charset='utf8',
-                                                cursorclass=self.pymysql.cursors.DictCursor,
-                                                connect_timeout=5)
+		if len(local_buffer) > 0:
+			while True:
+				try:
+					conn = self.pymysql.connect(host='bigbird.iptime.org',
+												user='admin', passwd='admin123',
+												db='concepters', charset='utf8',
+												cursorclass=self.pymysql.cursors.DictCursor,
+												connect_timeout=5)
 
-                except Exception as e:
-                    print(e)
-                    continue
+				except Exception as e:
+					print(e)
+					continue
 
-                else:
-                    break
+				else:
+					break
 
-            try:
-                with conn.cursor() as cursor:
-                    for tweet, _db_num in local_buffer:
-                        if not tweet.is_reply:
-                            query = tweet.get_delete_query(_db_num)
-                            cursor.execute(query)
-                        query = tweet.get_insert_query(conn, _db_num)
-                        cursor.execute(query)
-                    conn.commit()
+			try:
+				with conn.cursor() as cursor:
+					for tweet, _db_num in local_buffer:
+						if not tweet.is_reply:
+							query = tweet.get_delete_query(_db_num)
+							cursor.execute(query)
+						query = tweet.get_insert_query(conn, _db_num)
+						cursor.execute(query)
+					conn.commit()
 
-            except Exception as e:
-                print(e)
+			except Exception as e:
+				print(e)
 
-            finally:
-                conn.close()
+			finally:
+				conn.close()
 
-    def insert_tweet(self, tweet: Tweet = None, db_num: int = -1, flush=False):
-        self.queue.put((tweet, db_num))
-        if self.queue.qsize() >= self.DB_DUMP_SIZE:
-            self.flush()
+	def insert_tweet(self, tweet: Tweet = None, db_num: int = -1, flush=False):
+		self.queue.put((tweet, db_num))
+		if self.queue.qsize() >= self.DB_DUMP_SIZE:
+			self.flush()
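insert_tweet() only enqueues; rows reach the database when the queue hits DB_DUMP_SIZE (128) or flush() runs, and the flush keyword argument is currently unused, so callers should flush explicitly at shutdown rather than rely on __del__. A minimal usage sketch, where parsed_tweets is a hypothetical iterable of Tweet objects:

    helper = TwitterDBHelper()
    for tweet in parsed_tweets:
        helper.insert_tweet(tweet, db_num=3)   # buffered, no DB round-trip yet
    helper.flush()                             # drain whatever is still queued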
diff --git a/WebBasedCrawler/twitter/tweet.py b/WebBasedCrawler/twitter/tweet.py
index c5d0d2c..5a81759 100644
--- a/WebBasedCrawler/twitter/tweet.py
+++ b/WebBasedCrawler/twitter/tweet.py
@@ -3,22 +3,22 @@ from base.dbdata import DataDBRow
 
 
 class Tweet(DataDBRow):
-    def __init__(self):
-        super(self.__class__, self).__init__()
+	def __init__(self):
+		super(self.__class__, self).__init__()
 
-        self.tweet_id = None
-        self.user_id = None
-        self.user_name = None
-        self.text = None
-        self.created_at = None
-        self.retweets = 0
-        self.favorites = 0
+		self.tweet_id = None
+		self.user_id = None
+		self.user_name = None
+		self.text = None
+		self.created_at = None
+		self.retweets = 0
+		self.favorites = 0
 
-        self.is_reply = False
-        self.reply_cnt = 0
-        self.retweet_cnt = 0
-        self.favorite_cnt = 0
-        self.top_link = None
-        self.tweet_link = None
+		self.is_reply = False
+		self.reply_cnt = 0
+		self.retweet_cnt = 0
+		self.favorite_cnt = 0
+		self.top_link = None
+		self.tweet_link = None
 
-        self.depth = 0
+		self.depth = 0
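One subtlety worth noting about Tweet: it inherits DataDBRow, but get_keys() and get_insert_query() iterate a fresh DataDBRow's __dict__, so only the data_* table columns are serialized; crawler-internal fields such as tweet_id and retweet_cnt never reach the insert statement. A quick check (illustrative, not part of the patch):

    t = Tweet()
    print(sorted(t.get_keys()))
    # prints the article_*/platform_* columns plus keyword_id, reply_url
    # and etc -- no tweet_id, user_name, or *_cnt fields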
diff --git a/WebBasedCrawler/twitter/twparser.py b/WebBasedCrawler/twitter/twparser.py
index 257f964..91b9511 100644
--- a/WebBasedCrawler/twitter/twparser.py
+++ b/WebBasedCrawler/twitter/twparser.py
@@ -7,90 +7,90 @@ import pytz
 
 
 class TweetParser:
-    @staticmethod
-    def parse(tag, keyword_id, depth=0, top_tw: Tweet=None):
-        tweet = Tweet()
+	@staticmethod
+	def parse(tag, keyword_id, depth=0, top_tw: Tweet=None):
+		tweet = Tweet()
 
-        tweet.tweet_id = int(tag.attrs['data-tweet-id'])
+		tweet.tweet_id = int(tag.attrs['data-tweet-id'])
 
-        nickname_tag = tag.select('strong.fullname')[0]
-        tweet.user_name = ''
-        for child in nickname_tag.children:
-            if isinstance(child, bs4.element.NavigableString):
-                if len(tweet.user_name) > 0:
-                    tweet.user_name += ' '
-                tweet.user_name += child
-        tweet.user_id = tag.select('span.username')[0].text[1:]
-        tweet.text = tag.select('p.tweet-text')[0].text
+		nickname_tag = tag.select('strong.fullname')[0]
+		tweet.user_name = ''
+		for child in nickname_tag.children:
+			if isinstance(child, bs4.element.NavigableString):
+				if len(tweet.user_name) > 0:
+					tweet.user_name += ' '
+				tweet.user_name += child
+		tweet.user_id = tag.select('span.username')[0].text[1:]
+		tweet.text = tag.select('p.tweet-text')[0].text
 
-        # time_str = tag.select('a.tweet-timestamp')[0].attrs['title']
-        # english
-        # tweet.created_at = datetime.datetime.strptime(time_str, '%I:%M %p - %d %b %Y')
-        # korean
-        # time_str = time_str.replace('오전', 'AM').replace('오후', 'PM')
-        # tweet.created_at = datetime.datetime.strptime(time_str, '%p %I:%M - %Y년 %m월 %d일')
+		# time_str = tag.select('a.tweet-timestamp')[0].attrs['title']
+		# english
+		# tweet.created_at = datetime.datetime.strptime(time_str, '%I:%M %p - %d %b %Y')
+		# korean
+		# time_str = time_str.replace('오전', 'AM').replace('오후', 'PM')
+		# tweet.created_at = datetime.datetime.strptime(time_str, '%p %I:%M - %Y년 %m월 %d일')
 
-        timestamp = int(tag.select('span._timestamp')[0].attrs['data-time'])
-        utc_dt = datetime.datetime.utcfromtimestamp(timestamp)
-        local_tz = pytz.timezone('Asia/Seoul')
-        local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
-        tweet.created_at = local_tz.normalize(local_dt)
+		timestamp = int(tag.select('span._timestamp')[0].attrs['data-time'])
+		utc_dt = datetime.datetime.utcfromtimestamp(timestamp)
+		local_tz = pytz.timezone('Asia/Seoul')
+		local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
+		tweet.created_at = local_tz.normalize(local_dt)
 
-        reply_tag = tag.select('div.ReplyingToContextBelowAuthor')
-        tweet.is_reply = len(reply_tag) > 0
+		reply_tag = tag.select('div.ReplyingToContextBelowAuthor')
+		tweet.is_reply = len(reply_tag) > 0
 
-        reply_cnt_tag = tag.select('span.ProfileTweet-action--reply > span.ProfileTweet-actionCount')
-        if len(reply_cnt_tag) > 0:
-            tweet.reply_cnt = int(reply_cnt_tag[0].attrs['data-tweet-stat-count'])
+		reply_cnt_tag = tag.select('span.ProfileTweet-action--reply > span.ProfileTweet-actionCount')
+		if len(reply_cnt_tag) > 0:
+			tweet.reply_cnt = int(reply_cnt_tag[0].attrs['data-tweet-stat-count'])
 
-        retweet_cnt_tag = tag.select('span.ProfileTweet-action--retweet > span.ProfileTweet-actionCount')
-        if len(retweet_cnt_tag) > 0:
-            tweet.retweet_cnt = int(retweet_cnt_tag[0].attrs['data-tweet-stat-count'])
+		retweet_cnt_tag = tag.select('span.ProfileTweet-action--retweet > span.ProfileTweet-actionCount')
+		if len(retweet_cnt_tag) > 0:
+			tweet.retweet_cnt = int(retweet_cnt_tag[0].attrs['data-tweet-stat-count'])
 
-        favorite_cnt_tag = tag.select('span.ProfileTweet-action--favorite > span.ProfileTweet-actionCount')
-        if len(favorite_cnt_tag) > 0:
-            tweet.favorites_cnt = int(favorite_cnt_tag[0].attrs['data-tweet-stat-count'])
+		favorite_cnt_tag = tag.select('span.ProfileTweet-action--favorite > span.ProfileTweet-actionCount')
+		if len(favorite_cnt_tag) > 0:
+			tweet.favorites_cnt = int(favorite_cnt_tag[0].attrs['data-tweet-stat-count'])
 
-        link_tag = tag.select('a.js-permalink')
-        if len(link_tag) > 0:
-            tweet.tweet_link = TwitterConfig.protocol + '://' + TwitterConfig.top_url + link_tag[0].attrs['href']
-        tweet.top_link = top_tw.tweet_link if top_tw else tweet.tweet_link
+		link_tag = tag.select('a.js-permalink')
+		if len(link_tag) > 0:
+			tweet.tweet_link = TwitterConfig.protocol + '://' + TwitterConfig.top_url + link_tag[0].attrs['href']
+		tweet.top_link = top_tw.tweet_link if top_tw else tweet.tweet_link
 
-        tweet.depth = depth
+		tweet.depth = depth
 
-        tweet.platform_name = 'twitter'
-        tweet.platform_form = 'post'
-        tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id
-        tweet.article_form = 'body' if tweet.depth is 0 else 'reply'
-        # tweet.article_parent = None
-        tweet.article_id = tweet.user_id
-        tweet.article_nickname = tweet.user_name
-        # tweet.article_title = None
-        tweet.article_data = tweet.text
-        tweet.article_url = tweet.top_link
-        # tweet.article_hit = 0
-        tweet.article_date = tweet.created_at
-        tweet.article_order = tweet.depth
-        # tweet.article_profile = tweet.user_name
-        tweet.article_profileurl = TwitterConfig.protocol + '://' + TwitterConfig.top_url + '/' + tweet.user_id
-        tweet.platform_id = top_tw.user_id if top_tw else tweet.user_id
-        tweet.keyword_id = keyword_id
-        tweet.reply_url = tweet.tweet_link
-        # tweet.etc = ''
+		tweet.platform_name = 'twitter'
+		tweet.platform_form = 'post'
+		tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id
+		tweet.article_form = 'body' if tweet.depth is 0 else 'reply'
+		# tweet.article_parent = None
+		tweet.article_id = tweet.user_id
+		tweet.article_nickname = tweet.user_name
+		# tweet.article_title = None
+		tweet.article_data = tweet.text
+		tweet.article_url = tweet.top_link
+		# tweet.article_hit = 0
+		tweet.article_date = tweet.created_at
+		tweet.article_order = tweet.depth
+		# tweet.article_profile = tweet.user_name
+		tweet.article_profileurl = TwitterConfig.protocol + '://' + TwitterConfig.top_url + '/' + tweet.user_id
+		tweet.platform_id = top_tw.user_id if top_tw else tweet.user_id
+		tweet.keyword_id = keyword_id
+		tweet.reply_url = tweet.tweet_link
+		# tweet.etc = ''
 
-        return tweet
+		return tweet
 
-    @staticmethod
-    def get_lone_container(soup, parent_tw):
-        lone_tweets = soup.select('div.ThreadedConversation--loneTweet')
-        container_tags = []
-        for tag in reversed(lone_tweets):
-            li = tag.select('li.stream-item')
-            if len(li) > 0 and 'data-item-id' in li[0].attrs:
-                tweet_id = int(li[0].attrs['data-item-id'])
-                if tweet_id == parent_tw.tweet_id:
-                    break
+	@staticmethod
+	def get_lone_container(soup, parent_tw):
+		lone_tweets = soup.select('div.ThreadedConversation--loneTweet')
+		container_tags = []
+		for tag in reversed(lone_tweets):
+			li = tag.select('li.stream-item')
+			if len(li) > 0 and 'data-item-id' in li[0].attrs:
+				tweet_id = int(li[0].attrs['data-item-id'])
+				if tweet_id == parent_tw.tweet_id:
+					break
 
-            container_tags.append(tag)
+			container_tags.append(tag)
 
-        return reversed(container_tags)
+		return reversed(container_tags)
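Two details in the twparser.py hunks carry over unchanged and are easy to trip on later: tweet.favorites_cnt assigns a brand-new attribute (tweet.py defines favorite_cnt), and `tweet.depth is 0` relies on CPython small-integer caching where `== 0` is meant. Separately, the timestamp handling isolated into a standalone sketch of the same pytz conversion parse() performs; to_seoul is an illustrative name, not a function from this patch:

    import datetime
    import pytz

    def to_seoul(epoch):
        # data-time is a UTC epoch; convert it exactly as parse() does
        utc_dt = datetime.datetime.utcfromtimestamp(int(epoch))
        local_tz = pytz.timezone('Asia/Seoul')
        local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
        return local_tz.normalize(local_dt)

    print(to_seoul(1502323200))  # 2017-08-10 09:00:00+09:00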