Merge branch 'working/twitter'

mjjo
2017-07-27 11:33:10 +09:00
29 changed files with 1221 additions and 15 deletions

View File

@@ -32,6 +32,7 @@ def is_debugger_attached():
is_debug = is_debugger_attached()
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
if is_debug:
cur_frame = inspect.currentframe()

View File

@@ -0,0 +1,79 @@
from pymysql.connections import Connection
import datetime
from numbers import Number
class DataDBRow:
def __init__(self):
self.platform_name = None
self.platform_form = None
self.platform_title = None
self.article_form = None
self.article_parent = None
self.article_id = None
self.article_nickname = None
self.article_title = None
self.article_data = None
self.article_url = None
self.article_hit = 0
self.article_date = None
self.article_order = 0
self.article_profile = None
self.article_profileurl = None
self.platform_id = None
self.keyword_id = -1
self.reply_url = None
self.etc = None
def get_keys(self):
inst = DataDBRow()
keys = ()
for key, value_type in inst.__dict__.items():
if key.startswith('__') or callable(value_type):
continue
keys += key,
return keys
def get_values(self, conn, db_num):
inst = DataDBRow()
values = ()
for key, value_type in inst.__dict__.items():
if key.startswith('__') or callable(value_type):
continue
value = self.__dict__[key]
if isinstance(value, Number):
values += str(value),
elif isinstance(value, str):
values += conn.escape(value.encode('utf8').decode('utf8')),
else:
values += conn.escape(value),
return values
def get_insert_query(self, conn, db_num):
inst = DataDBRow()
keys = ''
values = ''
for key, value_type in inst.__dict__.items():
if key.startswith('__') or callable(value_type):
continue
if len(keys) > 0:
keys += ', '
values += ', '
keys += key
value = self.__dict__[key]
if isinstance(value, Number):
values += str(value)
elif isinstance(value, str):
values += conn.escape(value.encode('utf8').decode('utf8'))
else:
values += conn.escape(value)
query = 'insert into data_{} ({}) values ({})'.format(db_num, keys, values)
return query
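
The three helpers above build the key list, the escaped value list, and the final INSERT statement from the instance's __dict__, so they rely on attribute insertion order staying stable (true in CPython 3.6+). A minimal usage sketch, assuming a reachable MySQL server with a data_1 table; the host and credentials below are placeholders:

import pymysql
from base.dbdata import DataDBRow

row = DataDBRow()
row.platform_name = 'twitter'
row.article_id = 'some_user'
row.article_data = 'example text'

# Placeholder connection details; any pymysql Connection works here, since
# get_insert_query() only uses conn.escape() to quote the values.
conn = pymysql.connect(host='localhost', user='user', passwd='secret', db='concepters', charset='utf8')
try:
    query = row.get_insert_query(conn, db_num=1)  # "insert into data_1 (platform_name, ...) values (...)"
    with conn.cursor() as cursor:
        cursor.execute(query)
    conn.commit()
finally:
    conn.close()
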

View File

@@ -97,6 +97,31 @@ def get_driver(platform, proxies):
else:
return platform_webdriver[platform](capabilities=desired_capabilities)
_expired_proxies = []
def set_proxy_expired(proxy):
if proxy not in _expired_proxies:
_expired_proxies.append(proxy)
address = proxy['http'][len('http://'):]
with open(proxy_filename, 'r') as f:
lines = f.readlines()
expired_idx = -1
for idx, line in enumerate(lines):
if line.startswith(address):
expired_idx = idx
break
if expired_idx >= 0:
lines[expired_idx] = '# ' + lines[expired_idx]
lines.append(lines.pop(expired_idx))
with open(proxy_filename, 'w') as f:
f.writelines(lines)
def get_proxy_from_file(filename):
"""
@@ -104,7 +129,7 @@ def get_proxy_from_file(filename):
:return (ip, port): string, string
if ip, port or filename is invalid, return (None, None)
"""
proxy_lists = [line.replace('\n', '') for line in open(filename) if re_ip.search(line)]
proxy_lists = [line.replace('\n', '') for line in open(filename) if not line.strip().startswith('#') and re_ip.search(line)]
if proxy_lists:
m = re_ip.search(proxy_lists[random.randint(0, len(proxy_lists) - 1)])
if m:
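
set_proxy_expired() takes the same dict shape that requests uses for its proxies argument, comments the matching line out in the proxy file, and moves it to the end; the updated get_proxy_from_file() then skips '#'-prefixed lines. A small sketch with a placeholder address:

import base.proxy

# Placeholder address; proxy.txt is expected to hold one "ip:port" entry per line.
proxy = {'http': 'http://203.0.113.10:3128', 'https': 'http://203.0.113.10:3128'}
base.proxy.set_proxy_expired(proxy)
# If that entry was present in proxy.txt, it is now commented out ("# 203.0.113.10:3128")
# and moved to the end of the file, so later calls to get_proxy_from_file() ignore it.
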

View File

@@ -0,0 +1,3 @@
requests
bs4
pytz

View File

View File

@@ -0,0 +1,62 @@
import datetime
import copy
class TwitterConfig:
protocol = 'https'
top_url = 'twitter.com'
search_url = '/i/search/timeline'
conversation_url_form = '/i/{}/conversation/{}'
def __init__(self):
self.keyword_id = -1
self.db_num = -1
self.id = 0
self.realtime = False
self.keywords = []
self.start_str = None
self.start = None
self.end_str = None
self.end = None
self.authorship = None
self.state = None
self.platform = None
def set_param(self, keyword_id, db_num, params):
self.keyword_id = int(keyword_id)
self.db_num = int(db_num)
self.id = int(params['id'])
self.realtime = params['realtime'] == '1'
self.keywords = []
for keyword in params['searches'].split(','):
self.keywords.append(keyword.strip())
self.start_str = str(params['start'])
self.end_str = str(params['end'])
self.start = datetime.datetime.combine(params['start'], datetime.datetime.min.time())
self.end = datetime.datetime.combine(params['end'], datetime.datetime.min.time())
self.authorship = params['authorship']
self.state = params['state']
self.platform = params['platform']
def split(self):
split_list = []
new_end = self.end
while new_end > self.start:
new_config = copy.deepcopy(self)
new_config.end = new_end
new_end = new_end + datetime.timedelta(days=-1)
new_config.start = new_end
new_config.start_str = new_config.start.strftime('%Y-%m-%d')
new_config.end_str = new_config.end.strftime('%Y-%m-%d')
split_list.append(new_config)
return split_list
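
split() walks backwards from end in one-day steps, producing one single-day copy of the config per iteration; these become the per-thread work units in twittercrawl.py. A quick worked example, setting the dates directly rather than through set_param():

import datetime
from twitter.twconfig import TwitterConfig

config = TwitterConfig()
config.start = datetime.datetime(2017, 7, 1)
config.end = datetime.datetime(2017, 7, 4)

for c in config.split():
    print(c.start_str, '->', c.end_str)
# 2017-07-03 -> 2017-07-04
# 2017-07-02 -> 2017-07-03
# 2017-07-01 -> 2017-07-02
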

View File

@@ -0,0 +1,79 @@
from twitter.tweet import Tweet
import multiprocessing as mp
class TwitterDBHelper:
pymysql = __import__('pymysql.cursors')
def __init__(self):
self.tweets = []
self.buffer = []
self.lock = mp.Lock()
pass
def __del__(self):
pass
def get_param(self, keyword_id):
query = "select * from keyword where id = " + str(keyword_id)
params = []
try:
conn = self.pymysql.connect(host='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor)
with conn.cursor() as cursor:
cursor.execute(query)
params = cursor.fetchone()
except Exception as e:
print(e)
exit(1)
else:
conn.close()
return params
def insert_tweet(self, tweet: Tweet = None, db_num: int = -1, flush=False):
# self.lock.acquire()
# if tweet is not None:
# self.buffer.append((tweet, db_num, ))
#
# local_buffer = None
# if len(self.buffer) >= 100 or flush:
# local_buffer = copy.deepcopy(self.buffer)
# self.buffer.clear()
# self.lock.release()
local_buffer = [(tweet, db_num, )]
if local_buffer:
while True:
try:
conn = self.pymysql.connect(host='bigbird.iptime.org',
user='admin', passwd='admin123',
db='concepters', charset='utf8',
cursorclass=self.pymysql.cursors.DictCursor,
connect_timeout=5)
except Exception as e:
print(e)
continue
else:
break
try:
with conn.cursor() as cursor:
for tweet, _db_num in local_buffer:
query = tweet.get_insert_query(conn, _db_num)
cursor.execute(query)
conn.commit()
except Exception as e:
print(e)
finally:
conn.close()
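
insert_tweet() currently writes one row per call and retries the connection in a loop until it succeeds; the commented-out block is the intended 100-row buffered variant. A minimal call sketch, assuming the hard-coded database and the target data_<n> table are reachable:

from twitter.twdbhelper import TwitterDBHelper
from twitter.tweet import Tweet

helper = TwitterDBHelper()
tw = Tweet()
tw.platform_name = 'twitter'
tw.article_id = 'someone'
tw.article_data = 'example tweet'

# Builds the INSERT via the inherited get_insert_query() and executes it against data_1;
# only the DataDBRow columns are persisted, not the tweet-specific fields.
helper.insert_tweet(tw, db_num=1)
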

View File

@@ -0,0 +1,24 @@
from base.dbdata import DataDBRow
class Tweet(DataDBRow):
def __init__(self):
super().__init__()
self.tweet_id = None
self.user_id = None
self.user_name = None
self.text = None
self.created_at = None
self.retweets = 0
self.favorites = 0
self.is_reply = False
self.reply_cnt = 0
self.retweet_cnt = 0
self.favorite_cnt = 0
self.top_link = None
self.tweet_link = None
self.depth = 0

View File

@@ -0,0 +1,289 @@
from twitter.twconfig import TwitterConfig
from twitter.twdbhelper import TwitterDBHelper
from twitter.tweet import Tweet
from twitter.twparser import TweetParser
import base.proxy
import base.baseclasses
import requests
import bs4
import json
import urllib
import threading
import queue
import time
class TwitterCrawler():
def __init__(self):
self.default_config = TwitterConfig()
self.db_helper = TwitterDBHelper()
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
params = self.db_helper.get_param(keyword_id)
self.default_config.set_param(keyword_id, db_num, params)
@staticmethod
def get_timeline_url(query, start_str, end_str, max_position=''):
params = {
'f': 'tweets',
'vertical': 'default',
'src': 'typd',
'q': '{} since:{} until:{}'.format(query, start_str, end_str),
'language': 'en',
'max_position': max_position,
}
url_tuple = (TwitterConfig.protocol, TwitterConfig.top_url, TwitterConfig.search_url, '', urllib.parse.urlencode(params), '')
return urllib.parse.urlunparse(url_tuple)
@staticmethod
def get_content_url(user_id, tweet_id, max_position=''):
params = {
'max_position': max_position,
}
sub_url = TwitterConfig.conversation_url_form.format(user_id, tweet_id)
url_tuple = (TwitterConfig.protocol, TwitterConfig.top_url, sub_url, '', urllib.parse.urlencode(params), '')
return urllib.parse.urlunparse(url_tuple)
@staticmethod
def get_page(url, proc_id):
headers = {
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
}
# if proxies is None:
proxies = base.proxy.get_proxy_for_requests()
resp = None
while True:
try:
resp = requests.get(url, headers=headers, proxies=proxies, timeout=3)
except Exception as e:
if proxies == (None, None):
break
print('[{}] proxy {} is expired. ({})'.format(proc_id, proxies, e))
base.proxy.set_proxy_expired(proxies)
proxies = base.proxy.get_proxy_for_requests()
else:
break
return resp
def runner_proc(self, proc_id, content_queue, result_queue, config):
print('{} to {} runner thread start'.format(config.start_str, config.end_str))
b_continue = True
min_tweet_id = None
max_tweet_id = None
max_position = ''
tweet_count = 0
while b_continue:
if min_tweet_id is not None:
max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id)
url = self.get_timeline_url(config.keywords[0], config.start_str, config.end_str, max_position)
resp = self.get_page(url, proc_id)
if resp is None:
break
j = json.loads(resp.content.decode('utf-8'))
soup = bs4.BeautifulSoup(j['items_html'], 'lxml')
tweet_tags = soup.select("div.tweet")
for tw in tweet_tags:
tweet = TweetParser.parse(tw, config.keyword_id)
if tweet.is_reply is True:
# print(' ## {}: {}...'.format(tweet.user_name, tweet.text[:20]))
continue
if tweet.reply_cnt > 0:
self.insert_content_pool(proc_id, content_queue, tweet, tweet)
self.db_helper.insert_tweet(tweet, config.db_num)
# print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20]))
count = len(tweet_tags)
if count == 0:
break
if min_tweet_id is None:
min_tweet_id = tweet_tags[0].attrs['data-item-id']
max_tweet_id = tweet_tags[-1].attrs['data-item-id']
tweet_count += count
print('{} to {} runner thread finished {}'.format(config.start_str, config.end_str, tweet_count))
result_queue.put((proc_id, tweet_count, ))
# self.runner_processing[proc_id].value = False
return proc_id, tweet_count,
@staticmethod
def insert_content_pool(proc_id: int, qu, tweet: Tweet, tweet_top: Tweet):
# print(' [{}] pool insert: {} ({})'.format(proc_id, tweet.text[:20] if tweet.text else '', tweet.tweet_link))
qu.put((tweet, tweet_top,))
@staticmethod
def get_content(content_queue):
sleep_time = time.time()
while True:
try:
parent_tw, top_tw, = content_queue.get(block=True, timeout=2)
except Exception as e:
if time.time()-sleep_time > 60:
break
else:
continue
else:
return parent_tw, top_tw,
return None, None,
def content_proc(self, proc_id, content_queue, result_queue):
print('[{}] content thread start'.format(proc_id))
tweet_count = 0
while True:
parent_tw, top_tw, = self.get_content(content_queue)
if not parent_tw:
break
# print(' [{}] <<< parent : {} ({})'.format(proc_id, parent_tw.text[:20], parent_tw.tweet_link))
max_position = ''
b_continue = True
while b_continue:
url = self.get_content_url(parent_tw.user_id, parent_tw.tweet_id, max_position)
resp = self.get_page(url, proc_id)
if resp is None or resp.status_code == 404:
break
elif resp.status_code != 200:
print('[WARNING] content_get code {}'.format(resp.status_code))
continue
j = json.loads(resp.content.decode('utf-8'))
soup = bs4.BeautifulSoup(j['items_html'], 'lxml')
reply_container_tags = soup.select('li.ThreadedConversation')
reply_container_tags += TweetParser.get_lone_container(soup, parent_tw)
for container_tags in reply_container_tags:
tweet_tags = container_tags.select('div.tweet')
if len(tweet_tags) > 0:
tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw)
# print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link))
self.insert_content_pool(proc_id, content_queue, tweet, top_tw)
self.db_helper.insert_tweet(tweet, self.default_config.db_num)
tweet_count += 1
b_continue = j['has_more_items']
if b_continue:
max_position = j['min_position']
result_queue.put((proc_id, tweet_count))
print('[{}] content thread finished'.format(proc_id))
return proc_id, tweet_count,
def debug_content(self):
content_qu = queue.Queue()
runner_result_qu = queue.Queue()
content_result_qu = queue.Queue()
test_tw = Tweet()
# test_tw.tweet_link = 'https://twitter.com/yniold_/status/886863893137678337'
# test_tw.user_id = 'yniold_'
# test_tw.tweet_id = 886863893137678337
test_tw.tweet_link = 'https://twitter.com/Awesome_vely/status/888704413111435264'
test_tw.user_id = 'Awesome_vely'
test_tw.tweet_id = 888704413111435264
test_tw.text = '시작'
self.insert_content_pool(0, content_qu, test_tw, test_tw)
content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
[th.start() for th in content_threads]
[th.join() for th in content_threads]
while not content_result_qu.empty():
res = content_result_qu.get()
print('reply : {}'.format(res))
print('end all')
def test_insert_db(self):
test_tw = Tweet()
test_tw.tweet_link = 'https://twitter.com/moonriver365/status/885797401033818112'
test_tw.user_id = 'moonriver365'
test_tw.tweet_id = 885797401033818112
for _ in range(5):
self.db_helper.insert_tweet(test_tw, self.default_config.db_num)
def debug(self):
if base.baseclasses.is_debug:
## check proxy
# base.proxy.get_proxy_from_file('proxy.txt')
# proxy = {'https': 'http://45.56.86.93:3128', 'http': 'http://45.56.86.93:3128'}
# base.proxy.set_proxy_expired(proxy)
# return
## contents check
self.debug_content()
# split_config = self.default_config.split()
# self.test_insert_db()
print("debug end")
# exit()
def start(self):
start_time = time.time()
# self.debug()
# return
# run
split_config = self.default_config.split()
content_qu = queue.Queue()
runner_result_qu = queue.Queue()
content_result_qu = queue.Queue()
runner_threads = [threading.Thread(target=self.runner_proc, args=(proc_id, content_qu, runner_result_qu, config)) for proc_id, config in enumerate(split_config)]
content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
[th.start() for th in runner_threads]
[th.start() for th in content_threads]
[th.join() for th in runner_threads]
[th.join() for th in content_threads]
# rerun zero runners
runner_threads = []
runner_result_qu2 = queue.Queue()
while not runner_result_qu.empty():
    proc_id, tweet_count = runner_result_qu.get()
    if tweet_count == 0:
        th = threading.Thread(target=self.runner_proc, args=(proc_id, content_qu, runner_result_qu2, split_config[proc_id]))
        runner_threads.append(th)
content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
[th.start() for th in runner_threads]
[th.start() for th in content_threads]
[th.join() for th in runner_threads]
[th.join() for th in content_threads]
# print running time
delta = time.time() - start_time
m, s = divmod(delta, 60)
h, m = divmod(m, 60)
print("finished all {}:{:02d}:{:02d} ".format(int(h), int(m), int(s)))

View File

@@ -0,0 +1,96 @@
from twitter.tweet import Tweet
from twitter.twconfig import TwitterConfig
import bs4
import datetime
import pytz
class TweetParser:
@staticmethod
def parse(tag, keyword_id, depth=0, top_tw: Tweet=None):
tweet = Tweet()
tweet.tweet_id = int(tag.attrs['data-tweet-id'])
nickname_tag = tag.select('strong.fullname')[0]
tweet.user_name = ''
for child in nickname_tag.children:
if isinstance(child, bs4.element.NavigableString):
if len(tweet.user_name) > 0:
tweet.user_name += ' '
tweet.user_name += child
tweet.user_id = tag.select('span.username')[0].text[1:]
tweet.text = tag.select('p.tweet-text')[0].text
# time_str = tag.select('a.tweet-timestamp')[0].attrs['title']
# english
# tweet.created_at = datetime.datetime.strptime(time_str, '%I:%M %p - %d %b %Y')
# korean
# time_str = time_str.replace('오전', 'AM').replace('오후', 'PM')
# tweet.created_at = datetime.datetime.strptime(time_str, '%p %I:%M - %Y년 %m월 %d일')
timestamp = int(tag.select('span._timestamp')[0].attrs['data-time'])
utc_dt = datetime.datetime.utcfromtimestamp(timestamp)
local_tz = pytz.timezone('Asia/Seoul')
local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
tweet.created_at = local_tz.normalize(local_dt)
reply_tag = tag.select('div.ReplyingToContextBelowAuthor')
tweet.is_reply = len(reply_tag) > 0
reply_cnt_tag = tag.select('span.ProfileTweet-action--reply > span.ProfileTweet-actionCount')
if len(reply_cnt_tag) > 0:
tweet.reply_cnt = int(reply_cnt_tag[0].attrs['data-tweet-stat-count'])
retweet_cnt_tag = tag.select('span.ProfileTweet-action--retweet > span.ProfileTweet-actionCount')
if len(retweet_cnt_tag) > 0:
tweet.retweet_cnt = int(retweet_cnt_tag[0].attrs['data-tweet-stat-count'])
favorite_cnt_tag = tag.select('span.ProfileTweet-action--favorite > span.ProfileTweet-actionCount')
if len(favorite_cnt_tag) > 0:
tweet.favorite_cnt = int(favorite_cnt_tag[0].attrs['data-tweet-stat-count'])
link_tag = tag.select('a.js-permalink')
if len(link_tag) > 0:
tweet.tweet_link = TwitterConfig.protocol + '://' + TwitterConfig.top_url + link_tag[0].attrs['href']
tweet.top_link = top_tw.tweet_link if top_tw else tweet.tweet_link
tweet.depth = depth
tweet.platform_name = 'twitter'
tweet.platform_form = 'post'
tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id
tweet.article_form = 'body' if tweet.depth == 0 else 'reply'
# tweet.article_parent = None
tweet.article_id = tweet.user_id
tweet.article_nickname = tweet.user_name
# tweet.article_title = None
tweet.article_data = tweet.text
tweet.article_url = tweet.top_link
# tweet.article_hit = 0
tweet.article_date = tweet.created_at
tweet.article_order = tweet.depth
# tweet.article_profile = tweet.user_name
tweet.article_profileurl = TwitterConfig.protocol + '://' + TwitterConfig.top_url + '/' + tweet.user_id
tweet.platform_id = top_tw.user_id if top_tw else tweet.user_id
tweet.keyword_id = keyword_id
tweet.reply_url = tweet.tweet_link
# tweet.etc = ''
return tweet
@staticmethod
def get_lone_container(soup, parent_tw):
lone_tweets = soup.select('div.ThreadedConversation--loneTweet')
container_tags = []
for tag in reversed(lone_tweets):
li = tag.select('li.stream-item')
if len(li) > 0 and 'data-item-id' in li[0].attrs:
tweet_id = int(li[0].attrs['data-item-id'])
if tweet_id == parent_tw.tweet_id:
break
container_tags.append(tag)
return reversed(container_tags)
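
The parser reads the epoch seconds from the span._timestamp tag's data-time attribute, treats them as UTC, and converts to Asia/Seoul; this avoids the locale-dependent parsing of the tooltip string shown in the commented-out strptime attempts. The conversion in isolation:

import datetime
import pytz

timestamp = 1500000000  # example value of the data-time attribute
utc_dt = datetime.datetime.utcfromtimestamp(timestamp)
local_tz = pytz.timezone('Asia/Seoul')
local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
created_at = local_tz.normalize(local_dt)
print(created_at)  # 2017-07-14 11:40:00+09:00
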

View File

@@ -11,6 +11,8 @@ from kakao import kakaocrawl
from naver import navercrawl
from facebook import facebookcrawl
from facebook import facebookcrawlbs
from twitter import twittercrawl
from youtube import youtubecrawl
from base.baseclasses import print_and_flush
@@ -26,8 +28,12 @@ class WebBasedCrawler:
self.crawler = kakaocrawl.KakaoMainCrawler()
elif platform == "navercafe":
self.crawler = navercrawl.NaverCafeMainAreaCrawler()
elif platform == "facebook":
elif platform == 'facebook':
self.crawler = facebookcrawlbs.FacebookMainCrawler()
elif platform == 'twitter':
self.crawler = twittercrawl.TwitterCrawler()
elif platform == 'youtube':
self.crawler = youtubecrawl.YoutubeMainCrawler()
else:
self.crawler = None
raise Exception
@@ -38,7 +44,7 @@ class WebBasedCrawler:
browser_opt = ('chrome', "ie", "opera", "firefox")
platform_opt = ('instagram', 'kakaostory', 'navercafe', "facebook")
platform_opt = ('instagram', 'kakaostory', 'navercafe', 'facebook', 'twitter', 'youtube')
def get_browser_info(platform_, file_name="browser.txt"):
@@ -73,7 +79,7 @@ def get_browser_info(platform_, file_name="browser.txt"):
if __name__ == '__main__':
"""
sys.argv[0] webbasedcrawler.py
sys.argv[1] instagram, kakaochannel, navercafe, facebook
sys.argv[1] instagram, kakaochannel, navercafe, facebook, twitter, youtube
sys.argv[2] keyword_id
sys.argv[3] data group
sys.argv[4] start_day
@@ -85,8 +91,7 @@ if __name__ == '__main__':
else:
print_and_flush("Check Argumenets!")
exit(1)
crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2],
sys.argv[3], sys.argv[4], sys.argv[5])
crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
crawler.start()
print_and_flush("Finished Crawling :)")
exit(0)
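
With the twitter branch wired into the dispatcher, the new crawler can also be driven directly, bypassing the CLI wrapper. A hypothetical sketch; the argument values are made up, and set_arguments() expects the matching keyword row to exist in the database:

from twitter import twittercrawl

# Hypothetical values; only keyword_id and db_num are actually used by the
# twitter crawler's set_arguments().
crawler = twittercrawl.TwitterCrawler()
crawler.set_arguments(None, keyword_id=12, db_num=3, before_day=None, until_page=None)
crawler.start()
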

View File

View File

@@ -0,0 +1,7 @@
class YoutubeMainCrawler:
def __init__(self):
pass
def start(self):
pass