- Fix the Twitter crawler

- Remove duplicates before insert (a sketch of the pattern follows the file summary below)
- Fall back to the DB when every proxy in proxy.txt has expired
- Fetch proxies from the proxy DB with duplicates removed
- Delay page requests by 0.1 s to work around proxy problems
- Make the crawler's stop actually take effect
- Apply realtime mode
mjjo
2017-07-28 14:27:38 +09:00
parent 2973faaf39
commit 3d5e2d0c98
8 changed files with 267 additions and 198 deletions
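The headline change is delete-then-insert deduplication: before a top-level tweet is written, any earlier row with the same article_url is removed (DataDBRow.get_delete_query, called by TwitterDBHelper just before the insert). A minimal sketch of the pattern, assuming a pymysql connection and the data_{n} tables seen in the diffs; unlike the commit, which builds its SQL with str.format, the sketch binds the URL as a parameter:

```python
import pymysql

def insert_deduped(conn, db_num, row):
    """Delete any earlier crawl of the same article, then insert fresh.

    `row` is a hypothetical dict with at least 'article_url'; the real code
    goes through DataDBRow.get_delete_query()/get_insert_query() instead.
    """
    table = 'data_{}'.format(int(db_num))  # table names cannot be bind parameters
    with conn.cursor() as cursor:
        # Drop the old row first so re-crawled tweets never accumulate.
        cursor.execute('delete from {} where article_url=%s'.format(table),
                       (row['article_url'],))
        cols = ', '.join(row)
        marks = ', '.join(['%s'] * len(row))
        cursor.execute('insert into {} ({}) values ({})'.format(table, cols, marks),
                       tuple(row.values()))
    conn.commit()  # one commit covers both statements
```

With a unique index on article_url, MySQL's INSERT ... ON DUPLICATE KEY UPDATE (or REPLACE INTO) would achieve the same effect in one statement; delete-then-insert keeps the existing insert-query builder untouched.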

View File

@@ -314,10 +314,10 @@ void Widget::StopButton()
     SetCrawlingState("Stop");
     m_strCrawlingID.clear();
     //qDebug() << m_nPlatform;
-    if(4 <= m_nPlatform && m_nPlatform <= 12)
-    {
-        m_pManage[m_nPlatform]->clossProcess();
-    }
+    if(m_nPlatform < 4 || m_nPlatform == 13)
+        return;
+
+    m_pManage[m_nPlatform]->clossProcess();
 }
 void Widget::Update()
@@ -381,6 +381,10 @@ void Widget::RefreshButton()
         case 11:str += ", Facebook Tag"; break;
         case 12:str += ", Facebook User"; break;
         case 13:str += ", Naver Blog Accuracy"; break;
+        case 14:str += ", Twitter Tag"; break;
+        case 15:str += ", Twitter User"; break;
+        case 16:str += ", Youtube Tag"; break;
+        case 17:str += ", Youtube User"; break;
     }
     m_pcb->addItem(str,query.value(7));
 }

View File

@@ -52,6 +52,10 @@ class DataDBRow:
         return values

+    def get_delete_query(self, db_num):
+        query = """delete from data_{} where article_url='{}'""".format(db_num, self.article_url)
+        return query
+
     def get_insert_query(self, conn, db_num):
         inst = DataDBRow()

View File

@@ -101,6 +101,9 @@ _expired_proxies = []
 def set_proxy_expired(proxy):
+    if not os.path.exists(proxy_filename) or not os.path.isfile(proxy_filename):
+        return
+
     if proxy not in _expired_proxies:
         _expired_proxies.append(proxy)
@@ -134,7 +137,7 @@ def get_proxy_from_file(filename):
     m = re_ip.search(proxy_lists[random.randint(0, len(proxy_lists) - 1)])
     if m:
         return m.group(1), m.group(2)
-    return (None, None)
+    return None, None

 def get_proxy_from_db():
@@ -144,29 +147,33 @@ def get_proxy_from_db():
                                db='concepters', charset='utf8',
                                cursorclass=pymysql.cursors.DictCursor)
         with conn.cursor() as cursor:
-            cursor.execute("select * from Proxy")
-            proxy_lists = [(str(i['Proxy']), str(i['Port'])) for i in cursor if i['Port']]
+            cursor.execute("select * from Proxy group by Proxy")
+            proxy_lists = [(str(i['Proxy']), str(i['Port'])) for i in cursor if i['Proxy'] and i['Port']]
+        proxy_lists.sort()
         conn.close()
         return proxy_lists[random.randint(0, len(proxy_lists) - 1)] if proxy_lists else (None, None)
     except:
-        conn.close()
-        return (None, None)
+        return None, None

 def get_proxy():
     if os.path.exists(proxy_filename) and os.path.isfile(proxy_filename):
-        return get_proxy_from_file(proxy_filename)
+        ip, port = get_proxy_from_file(proxy_filename)
+        if not ip or not port:
+            return get_proxy_from_db()
+        else:
+            return ip, port
     else:
         return get_proxy_from_db()

 def get_requests_proxy(proxies):
-    return {'http': 'http://' + proxies, 'https': 'http://' + proxies}
+    return {
+        'http': 'http://{}'.format(proxies),
+        'https': 'https://{}'.format(proxies),
+    }

 def get_proxy_for_requests():
     ip, port = get_proxy()
     return get_requests_proxy(ip + ":" + port)
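Taken together, the proxy changes form a fallback chain: try proxy.txt first, fall back to the deduplicated Proxy table when the file is missing or exhausted, and wrap the winner into the dict requests expects. A rough usage sketch under those assumptions (the target URL is hypothetical):

```python
import requests
import base.proxy

def fetch(url, proc_id=0):
    # get_proxy() prefers proxy.txt and falls back to the DB;
    # get_requests_proxy() turns "ip:port" into requests' proxies dict.
    proxies = base.proxy.get_proxy_for_requests()
    try:
        return requests.get(url, proxies=proxies, timeout=3)
    except requests.RequestException as e:
        print('[{}] proxy {} is expired. ({})'.format(proc_id, proxies, e))
        base.proxy.set_proxy_expired(proxies)  # skip this proxy on later picks
        return None

# resp = fetch('https://twitter.com/search')  # illustrative URL only
```

One caveat worth flagging: get_proxy_for_requests() concatenates ip + ":" + port, so when get_proxy() yields (None, None) it raises a TypeError rather than producing the (None, None) sentinel that get_page() compares self.proxy against.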

View File

@@ -27,7 +27,7 @@ class TwitterConfig:
         self.db_num = int(db_num)
         self.id = int(params['id'])
-        self.realtime = params['realtime'] == '1'
+        self.realtime = params['realtime'] == 1

         self.keywords = []
         for keyword in params['searches'].split(','):
@@ -35,14 +35,22 @@ class TwitterConfig:
         self.start_str = str(params['start'])
         self.end_str = str(params['end'])
-        self.start = datetime.datetime.combine(params['start'], datetime.datetime.min.time())
-        self.end = datetime.datetime.combine(params['end'], datetime.datetime.min.time())
+        self.start = datetime.datetime.strptime(self.start_str, '%Y-%m-%d')
+        self.end = datetime.datetime.strptime(self.end_str, '%Y-%m-%d')

         self.authorship = params['authorship']
         self.state = params['state']
         self.platform = params['platform']

+    def reload_realtime(self, before_day):
+        if not self.realtime:
+            return
+        self.end_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d')
+        self.end = datetime.datetime.strptime(self.end_str, '%Y-%m-%d')
+        self.start = self.end + datetime.timedelta(days=int(before_day))
+        self.start_str = datetime.datetime.strftime(self.start, '%Y-%m-%d')
+
     def split(self):
         split_list = []
         new_end = self.end

View File

@@ -68,6 +68,9 @@ class TwitterDBHelper:
         try:
             with conn.cursor() as cursor:
                 for tweet, _db_num in local_buffer:
+                    if not tweet.is_reply:
+                        query = tweet.get_delete_query(_db_num)
+                        cursor.execute(query)
                     query = tweet.get_insert_query(conn, _db_num)
                     cursor.execute(query)
             conn.commit()

View File

@@ -20,9 +20,12 @@ class TwitterCrawler():
     def __init__(self):
         self.default_config = TwitterConfig()
         self.db_helper = TwitterDBHelper()
+        self.proxy = None
+        self.before_day = None

     def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
         params = self.db_helper.get_param(keyword_id)
+        self.before_day = before_day
         self.default_config.set_param(keyword_id, db_num, params)

     @staticmethod
@staticmethod @staticmethod
@@ -49,26 +52,26 @@ class TwitterCrawler():
         url_tupple = (TwitterConfig.protocol, TwitterConfig.top_url, sub_url, '', urllib.parse.urlencode(params), '')
         return urllib.parse.urlunparse(url_tupple)

-    @staticmethod
-    def get_page(url, proc_id):
+    def get_page(self, url, proc_id):
         headers = {
             'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
             'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
         }
-        # if proxies is None:
-        proxies = base.proxy.get_proxy_for_requests()
+        if not self.proxy:
+            self.proxy = base.proxy.get_proxy_for_requests()

         resp = None
         while True:
             try:
-                resp = requests.get(url, headers=headers, proxies=proxies, timeout=3)
+                time.sleep(0.1)
+                resp = requests.get(url, headers=headers, proxies=self.proxy, timeout=3)
             except Exception as e:
-                if proxies == (None, None):
+                if self.proxy == (None, None):
                     break
-                print('[{}] proxy {} is expired. ({})'.format(proc_id, proxies, e))
-                base.proxy.set_proxy_expired(proxies)
-                proxies = base.proxy.get_proxy_for_requests()
+                print('[{}] proxy {} is expired. ({})'.format(proc_id, self.proxy, e))
+                base.proxy.set_proxy_expired(self.proxy)
+                self.proxy = base.proxy.get_proxy_for_requests()
             else:
                 break
@@ -108,6 +111,7 @@ class TwitterCrawler():
                 self.db_helper.insert_tweet(tweet, config.db_num)
                 # print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20]))
+                print('body {} [{}]'.format(tweet.top_link, 'ok'))

             count = len(tweet_tags)
             if count == 0:
@@ -119,7 +123,10 @@ class TwitterCrawler():
             tweet_count += count

         print('{} to {} runner thread finished {}'.format(config.start_str, config.end_str, tweet_count))
-        result_queue.put((proc_id, tweet_count, ))
+        result_queue.put({
+            'proc_id': proc_id,
+            'count': tweet_count,
+        })
         # self.runner_processing[proc_id].value = False
         return proc_id, tweet_count,
@@ -135,7 +142,7 @@ class TwitterCrawler():
             try:
                 parent_tw, top_tw, = content_queue.get(block=True, timeout=2)
             except Exception as e:
-                if time.time()-sleep_time > 60:
+                if time.time()-sleep_time > 15:
                     break
                 else:
                     continue
@@ -177,6 +184,7 @@ class TwitterCrawler():
             if len(tweet_tags) > 0:
                 tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw)
                 # print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link))
+                print('reply {} [{}]'.format(tweet.top_link, 'ok'))
                 self.insert_content_pool(proc_id, content_queue, tweet, top_tw)
                 self.db_helper.insert_tweet(tweet, self.default_config.db_num)
                 tweet_count += 1
@@ -185,7 +193,11 @@ class TwitterCrawler():
             if b_continue:
                 max_position = j['min_position']

-        result_queue.put((proc_id, tweet_count))
+        result_queue.put({
+            'proc_id': proc_id,
+            'count': tweet_count,
+        })
         print('[{}] content thread finished'.format(proc_id))
         return proc_id, tweet_count,
@@ -242,18 +254,19 @@ class TwitterCrawler():
         print("debug end")
         # exit()

-    def start(self):
+    def run(self):
         start_time = time.time()
-        # self.debug()
-        # return
         # run
         split_config = self.default_config.split()
         content_qu = queue.Queue()
         runner_result_qu = queue.Queue()
         content_result_qu = queue.Queue()
+        runner_result_cnt = 0
+        content_result_cnt = 0

         runner_threads = [threading.Thread(target=self.runner_proc, args=(proc_id, content_qu, runner_result_qu, config)) for proc_id, config in enumerate(split_config)]
         content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
@@ -269,12 +282,17 @@ class TwitterCrawler():
         idx = 0
         while not runner_result_qu.empty():
             res = runner_result_qu.get()
-            if res == 0:
+            runner_result_cnt += res['count']
+            if res['count'] == 0:
                 th = threading.Thread(target=self.runner_proc, args=(idx, content_qu, runner_result_qu2, split_config[idx]))
                 runner_threads.append(th)
             idx += 1

-        content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
+        if len(runner_threads) > 0:
+            content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
+        else:
+            content_threads = []

         [th.start() for th in runner_threads]
         [th.start() for th in content_threads]
@@ -282,8 +300,33 @@ class TwitterCrawler():
         [th.join() for th in runner_threads]
         [th.join() for th in content_threads]

+        while not runner_result_qu2.empty():
+            res = runner_result_qu2.get()
+            runner_result_cnt += res['count']
+
+        while not content_result_qu.empty():
+            res = content_result_qu.get()
+            content_result_cnt += res['count']
+
+        print('total body count: {}'.format(runner_result_cnt))
+        print('total reply count: {}'.format(content_result_cnt))
+
         # print running time
         delta = time.time() - start_time
         m, s = divmod(delta, 60)
         h, m = divmod(m, 60)
         print("finished all {}:{:02d}:{:02d} ".format(int(h), int(m), int(s)))
+
+    def start(self):
+        # self.debug()
+        # return
+        # run
+        while True:
+            self.default_config.reload_realtime(self.before_day)
+            self.run()
+            if not self.default_config.realtime:
+                break
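
The new start()/run() split is what drives realtime mode: each pass re-anchors the crawl window at today via reload_realtime(), with before_day setting how far back it reaches (the end + timedelta(days=before_day) arithmetic implies a negative value). A standalone sketch of the same window math, with an assumed before_day of -7:

```python
import datetime

def realtime_window(before_day, today=None):
    """Mirrors TwitterConfig.reload_realtime(): the window ends today and
    starts `before_day` days away, so before_day is expected to be negative."""
    now = today if today is not None else datetime.datetime.now()
    end = datetime.datetime.strptime(now.strftime('%Y-%m-%d'), '%Y-%m-%d')
    start = end + datetime.timedelta(days=int(before_day))
    return start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')

# For the commit date: realtime_window(-7, datetime.datetime(2017, 7, 28))
# returns ('2017-07-21', '2017-07-28')
```

Non-realtime configs fall straight through the loop after a single run(), so batch crawls behave exactly as before.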

View File

@@ -34,5 +34,5 @@ cp -r ${PROJECT_PATH}/WebBasedCrawler/*.txt ${PACKAGE_PATH}
 cp -r ${PROJECT_PATH}/WebBasedCrawler/*/ ${PACKAGE_PATH}

 rm ${PACKAGE_PATH}/AppRun
-rm ${PACKAGE_PATH}/qt.conf
+# rm ${PACKAGE_PATH}/qt.conf
 rm -r ${PACKAGE_PATH}/translations