- 트위터 크롤러 수정
- 중복 제거 후 insert
- proxy.txt가 모두 만료되면 db 사용
- proxy db에서 중복 제거해서 가져오기
- 프록시 문제로 페이지 요청 시 0.1초 딜레이
- 크롤러 stop 동작하도록 수정
- realtime 적용
This commit is contained in:
@@ -52,6 +52,10 @@ class DataDBRow:
|
||||
|
||||
return values
|
||||
|
||||
def get_delete_query(self, db_num):
|
||||
query = """delete from data_{} where article_url='{}'""".format(db_num, self.article_url)
|
||||
return query
|
||||
|
||||
def get_insert_query(self, conn, db_num):
|
||||
|
||||
inst = DataDBRow()
|
||||
|
||||
@@ -101,6 +101,9 @@ _expired_proxies = []
|
||||
|
||||
|
||||
def set_proxy_expired(proxy):
|
||||
if not os.path.exists(proxy_filename) or not os.path.isfile(proxy_filename):
|
||||
return
|
||||
|
||||
if proxy not in _expired_proxies:
|
||||
_expired_proxies.append(proxy)
|
||||
|
||||
@@ -134,7 +137,7 @@ def get_proxy_from_file(filename):
|
||||
m = re_ip.search(proxy_lists[random.randint(0, len(proxy_lists) - 1)])
|
||||
if m:
|
||||
return m.group(1), m.group(2)
|
||||
return (None, None)
|
||||
return None, None
|
||||
|
||||
|
||||
def get_proxy_from_db():
|
||||
@@ -144,29 +147,33 @@ def get_proxy_from_db():
|
||||
db='concepters', charset='utf8',
|
||||
cursorclass=pymysql.cursors.DictCursor)
|
||||
with conn.cursor() as cursor:
|
||||
cursor.execute("select * from Proxy")
|
||||
proxy_lists = [(str(i['Proxy']), str(i['Port'])) for i in cursor if i['Port']]
|
||||
cursor.execute("select * from Proxy group by Proxy")
|
||||
proxy_lists = [(str(i['Proxy']), str(i['Port'])) for i in cursor if i['Proxy'] and i['Port']]
|
||||
proxy_lists.sort()
|
||||
conn.close()
|
||||
return proxy_lists[random.randint(0, len(proxy_lists) - 1)] if proxy_lists else (None, None)
|
||||
except:
|
||||
conn.close()
|
||||
return (None, None)
|
||||
return None, None
|
||||
|
||||
|
||||
def get_proxy():
|
||||
if os.path.exists(proxy_filename) and os.path.isfile(proxy_filename):
|
||||
return get_proxy_from_file(proxy_filename)
|
||||
ip, port = get_proxy_from_file(proxy_filename)
|
||||
if not ip or not port:
|
||||
return get_proxy_from_db()
|
||||
else:
|
||||
return ip, port
|
||||
else:
|
||||
return get_proxy_from_db()
|
||||
|
||||
|
||||
def get_requests_proxy(proxies):
|
||||
return {'http': 'http://' + proxies, 'https': 'http://' + proxies}
|
||||
return {
|
||||
'http': 'http://{}'.format(proxies),
|
||||
'https': 'https://{}'.format(proxies),
|
||||
}
|
||||
|
||||
|
||||
def get_proxy_for_requests():
|
||||
ip, port = get_proxy()
|
||||
return get_requests_proxy(ip + ":" + port)
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ class TwitterConfig:
|
||||
self.db_num = int(db_num)
|
||||
|
||||
self.id = int(params['id'])
|
||||
self.realtime = params['realtime'] == '1'
|
||||
self.realtime = params['realtime'] == 1
|
||||
|
||||
self.keywords = []
|
||||
for keyword in params['searches'].split(','):
|
||||
@@ -35,14 +35,22 @@ class TwitterConfig:
|
||||
|
||||
self.start_str = str(params['start'])
|
||||
self.end_str = str(params['end'])
|
||||
|
||||
self.start = datetime.datetime.combine(params['start'], datetime.datetime.min.time())
|
||||
self.end = datetime.datetime.combine(params['end'], datetime.datetime.min.time())
|
||||
self.start = datetime.datetime.strptime(self.start_str, '%Y-%m-%d')
|
||||
self.end = datetime.datetime.strptime(self.end_str, '%Y-%m-%d')
|
||||
|
||||
self.authorship = params['authorship']
|
||||
self.state = params['state']
|
||||
self.platform = params['platform']
|
||||
|
||||
def reload_realtime(self, before_day):
|
||||
if not self.realtime:
|
||||
return
|
||||
|
||||
self.end_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y-%m-%d')
|
||||
self.end = datetime.datetime.strptime(self.end_str, '%Y-%m-%d')
|
||||
self.start = self.end + datetime.timedelta(days=int(before_day))
|
||||
self.start_str = datetime.datetime.strftime(self.start, '%Y-%m-%d')
|
||||
|
||||
def split(self):
|
||||
split_list = []
|
||||
new_end = self.end
|
||||
|
||||
@@ -68,6 +68,9 @@ class TwitterDBHelper:
|
||||
try:
|
||||
with conn.cursor() as cursor:
|
||||
for tweet, _db_num in local_buffer:
|
||||
if not tweet.is_reply:
|
||||
query = tweet.get_delete_query(_db_num)
|
||||
cursor.execute(query)
|
||||
query = tweet.get_insert_query(conn, _db_num)
|
||||
cursor.execute(query)
|
||||
conn.commit()
|
||||
|
||||
@@ -20,9 +20,12 @@ class TwitterCrawler():
|
||||
def __init__(self):
|
||||
self.default_config = TwitterConfig()
|
||||
self.db_helper = TwitterDBHelper()
|
||||
self.proxy = None
|
||||
self.before_day = None
|
||||
|
||||
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
|
||||
params = self.db_helper.get_param(keyword_id)
|
||||
self.before_day = before_day
|
||||
self.default_config.set_param(keyword_id, db_num, params)
|
||||
|
||||
@staticmethod
|
||||
@@ -49,26 +52,26 @@ class TwitterCrawler():
|
||||
url_tupple = (TwitterConfig.protocol, TwitterConfig.top_url, sub_url, '', urllib.parse.urlencode(params), '')
|
||||
return urllib.parse.urlunparse(url_tupple)
|
||||
|
||||
@staticmethod
|
||||
def get_page(url, proc_id):
|
||||
def get_page(self, url, proc_id):
|
||||
headers = {
|
||||
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
|
||||
'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
|
||||
}
|
||||
# if proxies is None:
|
||||
proxies = base.proxy.get_proxy_for_requests()
|
||||
if not self.proxy:
|
||||
self.proxy = base.proxy.get_proxy_for_requests()
|
||||
|
||||
resp = None
|
||||
while True:
|
||||
try:
|
||||
resp = requests.get(url, headers=headers, proxies=proxies, timeout=3)
|
||||
time.sleep(0.1)
|
||||
resp = requests.get(url, headers=headers, proxies=self.proxy, timeout=3)
|
||||
except Exception as e:
|
||||
if proxies == (None, None):
|
||||
if self.proxy == (None, None):
|
||||
break
|
||||
|
||||
print('[{}] proxy {} is expired. ({})'.format(proc_id, proxies, e))
|
||||
base.proxy.set_proxy_expired(proxies)
|
||||
proxies = base.proxy.get_proxy_for_requests()
|
||||
print('[{}] proxy {} is expired. ({})'.format(proc_id, self.proxy, e))
|
||||
base.proxy.set_proxy_expired(self.proxy)
|
||||
self.proxy = base.proxy.get_proxy_for_requests()
|
||||
else:
|
||||
break
|
||||
|
||||
@@ -108,6 +111,7 @@ class TwitterCrawler():
|
||||
self.db_helper.insert_tweet(tweet, config.db_num)
|
||||
|
||||
# print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20]))
|
||||
print('body {} [{}]'.format(tweet.top_link, 'ok'))
|
||||
|
||||
count = len(tweet_tags)
|
||||
if count == 0:
|
||||
@@ -119,7 +123,10 @@ class TwitterCrawler():
|
||||
tweet_count += count
|
||||
|
||||
print('{} to {} runner thread finished {}'.format(config.start_str, config.end_str, tweet_count))
|
||||
result_queue.put((proc_id, tweet_count, ))
|
||||
result_queue.put({
|
||||
'proc_id': proc_id,
|
||||
'count': tweet_count,
|
||||
})
|
||||
# self.runner_processing[proc_id].value = False
|
||||
return proc_id, tweet_count,
|
||||
|
||||
@@ -135,7 +142,7 @@ class TwitterCrawler():
|
||||
try:
|
||||
parent_tw, top_tw, = content_queue.get(block=True, timeout=2)
|
||||
except Exception as e:
|
||||
if time.time()-sleep_time > 60:
|
||||
if time.time()-sleep_time > 15:
|
||||
break
|
||||
else:
|
||||
continue
|
||||
@@ -177,6 +184,7 @@ class TwitterCrawler():
|
||||
if len(tweet_tags) > 0:
|
||||
tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw)
|
||||
# print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link))
|
||||
print('reply {} [{}]'.format(tweet.top_link, 'ok'))
|
||||
self.insert_content_pool(proc_id, content_queue, tweet, top_tw)
|
||||
self.db_helper.insert_tweet(tweet, self.default_config.db_num)
|
||||
tweet_count += 1
|
||||
@@ -185,7 +193,11 @@ class TwitterCrawler():
|
||||
if b_continue:
|
||||
max_position = j['min_position']
|
||||
|
||||
result_queue.put((proc_id, tweet_count))
|
||||
result_queue.put({
|
||||
'proc_id': proc_id,
|
||||
'count': tweet_count,
|
||||
})
|
||||
|
||||
print('[{}] content thread finished'.format(proc_id))
|
||||
return proc_id, tweet_count,
|
||||
|
||||
@@ -242,18 +254,19 @@ class TwitterCrawler():
|
||||
print("debug end")
|
||||
# exit()
|
||||
|
||||
def start(self):
|
||||
def run(self):
|
||||
start_time = time.time()
|
||||
|
||||
# self.debug()
|
||||
# return
|
||||
|
||||
# run
|
||||
split_config = self.default_config.split()
|
||||
|
||||
content_qu = queue.Queue()
|
||||
runner_result_qu = queue.Queue()
|
||||
content_result_qu = queue.Queue()
|
||||
|
||||
runner_result_cnt = 0
|
||||
content_result_cnt = 0
|
||||
|
||||
runner_threads = [threading.Thread(target=self.runner_proc, args=(proc_id, content_qu, runner_result_qu, config)) for proc_id, config in enumerate(split_config)]
|
||||
content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
|
||||
|
||||
@@ -269,12 +282,17 @@ class TwitterCrawler():
|
||||
idx = 0
|
||||
while not runner_result_qu.empty():
|
||||
res = runner_result_qu.get()
|
||||
if res == 0:
|
||||
runner_result_cnt += res['count']
|
||||
if res['count'] == 0:
|
||||
th = threading.Thread(target=self.runner_proc, args=(idx, content_qu, runner_result_qu2, split_config[idx]))
|
||||
runner_threads.append(th)
|
||||
|
||||
idx += 1
|
||||
content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
|
||||
|
||||
if len(runner_threads) > 0:
|
||||
content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
|
||||
else:
|
||||
content_threads = []
|
||||
|
||||
[th.start() for th in runner_threads]
|
||||
[th.start() for th in content_threads]
|
||||
@@ -282,8 +300,33 @@ class TwitterCrawler():
|
||||
[th.join() for th in runner_threads]
|
||||
[th.join() for th in content_threads]
|
||||
|
||||
while not runner_result_qu2.empty():
|
||||
res = runner_result_qu2.get()
|
||||
runner_result_cnt += res['count']
|
||||
|
||||
while not content_result_qu.empty():
|
||||
res = content_result_qu.get()
|
||||
content_result_cnt += res['count']
|
||||
|
||||
print('total body count: {}'.format(runner_result_cnt))
|
||||
print('total reply count: {}'.format(content_result_cnt))
|
||||
|
||||
# print running time
|
||||
delta = time.time() - start_time
|
||||
m, s = divmod(delta, 60)
|
||||
h, m = divmod(m, 60)
|
||||
print("finished all {}:{:02d}:{:02d} ".format(int(h), int(m), int(s)))
|
||||
|
||||
def start(self):
|
||||
|
||||
|
||||
# self.debug()
|
||||
# return
|
||||
|
||||
# run
|
||||
while True:
|
||||
self.default_config.reload_realtime(self.before_day)
|
||||
self.run()
|
||||
|
||||
if not self.default_config.realtime:
|
||||
break
|
||||
|
||||
Reference in New Issue
Block a user