diff --git a/WebBasedCrawler/base/proxy_crawler.py b/WebBasedCrawler/base/proxy_crawler.py index 8c2db82..442bdd0 100644 --- a/WebBasedCrawler/base/proxy_crawler.py +++ b/WebBasedCrawler/base/proxy_crawler.py @@ -110,7 +110,7 @@ def crawl_proxies(check_url=None): with open('proxy.txt', 'w') as f: print('proxy crawler dump start') for proxy in proxies_alive: - print(proxy) + # print(proxy) f.write(proxy + '\n') print('proxy crawler dump end') diff --git a/WebBasedCrawler/twitter/twittercrawl.py b/WebBasedCrawler/twitter/twittercrawl.py index 0cdaadd..2ea7937 100644 --- a/WebBasedCrawler/twitter/twittercrawl.py +++ b/WebBasedCrawler/twitter/twittercrawl.py @@ -16,7 +16,7 @@ import queue import time -class TwitterCrawler(): +class TwitterCrawler: def __init__(self): self.default_config = TwitterConfig() @@ -65,7 +65,6 @@ class TwitterCrawler(): resp = None while True: try: - # time.sleep(random.random()) resp = requests.get(url, headers=headers, proxies=self.proxy[proxy_key], timeout=3) except Exception as e: if self.proxy[proxy_key] == (None, None): @@ -79,6 +78,34 @@ class TwitterCrawler(): return resp + def get_page_data(self, url, is_runner, proc_id): + for retry_cnt in range(5): + # get response + resp = self.get_page(url, is_runner, proc_id) + if not resp: + break + + # check response + if resp.status_code == 404: + break + elif resp.status_code != 200: + print('[WARNING] content_get code {}'.format(resp.status_code)) + continue + + # parsing result + j = json.loads(resp.text) + if j['new_latent_count'] <= 0: + proxy_key = '{}-{}'.format('runner' if is_runner else 'content', proc_id) + self.proxy[proxy_key] = base.proxy.get_proxy_for_requests() + continue + else: + return j + + return { + 'items_html': '', + 'has_more_items': False, + } + def runner_proc(self, proc_id, content_queue, result_queue, config): print('[{}] {} to {} runner thread start'.format(proc_id, config.start_str, config.end_str)) @@ -89,19 +116,15 @@ class TwitterCrawler(): tweet_count = 0 while b_continue: - if min_tweet_id is not None: - max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id) url = self.get_timeline_url(config.keywords[0], config.start_str, config.end_str, max_position) - resp = self.get_page(url, True, proc_id) - if resp is None: - break - - j = json.loads(resp.content.decode('utf-8')) + j = self.get_page_data(url, True, proc_id) soup = bs4.BeautifulSoup(j['items_html'], 'lxml') tweet_tags = soup.select("div.tweet") + tweet_ids = [] for tw in tweet_tags: tweet = TweetParser.parse(tw, config.keyword_id) + tweet_ids.append(tweet.tweet_id) if tweet.is_reply is True: # print(' ## {}: {}...'.format(tweet.user_name, tweet.text[:20])) @@ -109,20 +132,25 @@ class TwitterCrawler(): if tweet.reply_cnt > 0: self.insert_content_pool(proc_id, content_queue, tweet, tweet) - self.db_helper.insert_tweet(tweet, config.db_num) # print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20])) - print('[{}] body {} [{}]'.format(proc_id, tweet.top_link, 'ok')) + print('[{}] body {} ({}) [{}]'.format(proc_id, tweet.top_link, tweet.created_at, 'ok')) count = len(tweet_tags) tweet_count += count - b_continue = j['has_more_items'] + b_continue = count > 0 + # b_continue = j['has_more_items'] if b_continue: if min_tweet_id is None: - min_tweet_id = tweet_tags[0].attrs['data-item-id'] - max_tweet_id = tweet_tags[-1].attrs['data-item-id'] + min_tweet_id = tweet_ids[0] + max_tweet_id = tweet_ids[-1] + + if 'min_position' in j: + max_position = j['min_position'] + else: + max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id) print('[{}] {} to {} runner thread finished {}'.format(proc_id, config.start_str, config.end_str, tweet_count)) result_queue.put({ @@ -169,14 +197,7 @@ class TwitterCrawler(): b_continue = True while b_continue: url = self.get_content_url(parent_tw.user_id, parent_tw.tweet_id, max_position) - resp = self.get_page(url, False, proc_id) - if resp is None or resp.status_code == 404: - break - elif resp.status_code != 200: - print('[WARNING] content_get code {}'.format(resp.status_code)) - continue - - j = json.loads(resp.content.decode('utf-8')) + j = self.get_page_data(url, False, proc_id) soup = bs4.BeautifulSoup(j['items_html'], 'lxml') reply_container_tags = soup.select('li.ThreadedConversation') @@ -186,7 +207,7 @@ class TwitterCrawler(): if len(tweet_tags) > 0: tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw) # print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link)) - # print('[{}] reply {} [{}]'.format(proc_id, tweet.top_link, 'ok')) + print('[{}] reply {} [{}]'.format(proc_id, tweet.top_link, 'ok')) self.insert_content_pool(proc_id, content_queue, tweet, top_tw) self.db_helper.insert_tweet(tweet, self.default_config.db_num) tweet_count += 1 @@ -260,7 +281,7 @@ class TwitterCrawler(): start_time = time.time() # run - worker_count = 64 + worker_count = 16 split_config = self.default_config.split() content_qu = queue.Queue() @@ -282,34 +303,35 @@ class TwitterCrawler(): # rerun zero runners print('restart failed runner') - runner_pool = concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) - runner_result_qu2 = queue.Queue() - idx = 0 - b_rerun = False + for retry in range(5): + runner_pool = concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) + runner_result_qu2 = queue.Queue() + b_rerun = False + while not runner_result_qu.empty(): + res = runner_result_qu.get() + runner_result_cnt += res['count'] + proc_id = res['proc_id'] + if res['count'] == 0: + runner_pool.submit(self.runner_proc, proc_id, content_qu, runner_result_qu2, split_config[proc_id]) + b_rerun = True + + while not content_result_qu.empty(): + res = content_result_qu.get() + content_result_cnt += res['count'] + + if b_rerun: + content_pool = concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) + [content_pool.submit(self.content_proc, proc_id, content_qu, content_result_qu) for proc_id in range(worker_count)] + + runner_pool.shutdown(wait=True) + content_pool.shutdown(wait=True) + self.db_helper.flush() + + runner_result_qu = runner_result_qu2 + while not runner_result_qu.empty(): res = runner_result_qu.get() runner_result_cnt += res['count'] - if res['count'] == 0: - runner_pool.submit(self.runner_proc, idx, content_qu, runner_result_qu2, split_config[idx]) - b_rerun = True - - idx += 1 - - while not content_result_qu.empty(): - res = content_result_qu.get() - content_result_cnt += res['count'] - - if b_rerun: - content_pool = concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) - [content_pool.submit(self.content_proc, proc_id, content_qu, content_result_qu) for proc_id in range(worker_count)] - - runner_pool.shutdown(wait=True) - content_pool.shutdown(wait=True) - self.db_helper.flush() - - while not runner_result_qu2.empty(): - res = runner_result_qu2.get() - runner_result_cnt += res['count'] while not content_result_qu.empty(): res = content_result_qu.get() @@ -326,7 +348,6 @@ class TwitterCrawler(): def start(self): - # self.debug() # return diff --git a/package-clients.bat b/package-clients.bat index d166e3b..03ea992 100644 --- a/package-clients.bat +++ b/package-clients.bat @@ -1,7 +1,7 @@ @echo off SET CUR_PATH=%cd% -SET DEPLOY_PATH=%CUR_PATH%\clients-win +SET DEPLOY_PATH=%CUR_PATH%\..\clients-win SET BUILD_PATH_PREFIX=%CUR_PATH%\build- SET BUILD_PATH_POSTFIX=-Desktop_Qt_5_4_2_MSVC2013_64bit @@ -14,9 +14,9 @@ SET FILTER_PATH=%BUILD_PATH_PREFIX%Filter%BUILD_PATH_POSTFIX%\%CONFIGURE% SET FILTER_PROCESS_PATH=%BUILD_PATH_PREFIX%FilterProcess%BUILD_PATH_POSTFIX%\%CONFIGURE% SET QT_PATH=C:\Qt\Qt5.4.2\5.4\msvc2013_64\bin -SET MYSQL_PATH=C:\Data\Project\temp\libmysql\64bit +SET MYSQL_PATH=%CUR_PATH%\..\lib\libmysql\64bit -SET PYTHONCRAWLER_PATH=C:\Data\Project\clients\WebBasedCrawler +SET PYTHONCRAWLER_PATH=%CUR_PATH%\WebBasedCrawler mkdir %DEPLOY_PATH% @@ -36,6 +36,7 @@ cd %CUR_PATH% copy %MYSQL_PATH%\libmysql.dll %DEPLOY_PATH% copy %CUR_PATH%\*.txt %DEPLOY_PATH%\ +copy %CUR_PATH%\*.ini %DEPLOY_PATH%\ xcopy %PYTHONCRAWLER_PATH%\*.py %DEPLOY_PATH%\ /c /d /s /y xcopy %PYTHONCRAWLER_PATH%\*.txt %DEPLOY_PATH%\ /c /d /s /y diff --git a/setup.ini b/setup.ini new file mode 100644 index 0000000..4954ffe --- /dev/null +++ b/setup.ini @@ -0,0 +1,48 @@ +#데이타 베이스 +[#database] +host=db.big-bird.co.kr +user=concepters +pass=con97996655 +name=dbconcepters + +# bigbird +[#database] +host=182.162.171.147 +user=admin +pass=con2214lac! +name=bigbird + +# bigbird1 +[database] +host=192.168.0.30 +user=admin +pass=admin123 +name=bigbird + +# 선컨셉터님 +[#database] +host=192.168.0.82 +user=root +pass=1234 +name=bigbird + + +#설정 날짜로 자르기 +[settings] +#days=-91 +from=2016-01-01 +to=2017-12-31 +upload=o +debug=o + +[spammers] +body_cut = 1.0 +body_ratio = 0.3 +reply_cut = 1.0 +reply_ratio = 0.5 +nick_cut = 1.0 +nick_ratio = 0.2 + +[influencers] +A = 0.6 +B = 0.4 \ No newline at end of file