runner thread가 종료되는 문제 해결
This commit is contained in:
13
WebBasedCrawler/base/debug.py
Normal file
13
WebBasedCrawler/base/debug.py
Normal file
@@ -0,0 +1,13 @@
|
||||
import linecache
|
||||
import sys
|
||||
import base.logger as logger
|
||||
|
||||
|
||||
def print_exception(obj=None):
    """Log the exception currently being handled, with its source location.

    Meant to be called from inside an ``except`` block. The location that is
    reported (file name, line number and source text) is taken from the first
    entry of the active traceback.

    Args:
        obj: Optional extra context appended to the log message. Falsy values
            are rendered as an empty string.
    """
    _, exc_obj, tb = sys.exc_info()
    if tb is None:
        # No exception is being handled, so sys.exc_info() returned
        # (None, None, None). Report that instead of failing with an
        # AttributeError on ``tb.tb_frame`` below.
        logger.log('print_exception called without an active exception: {}'.format(obj if obj else ''), logger.LogLevel.ERROR)
        return

    f = tb.tb_frame
    lineno = tb.tb_lineno
    filename = f.f_code.co_filename

    # Drop stale cache entries so an edited file yields its current text.
    linecache.checkcache(filename)
    line = linecache.getline(filename, lineno, f.f_globals)

    logger.log('({}({}) Exception from "{}"):\n {}, {}'.format(filename, lineno, line.strip(), exc_obj, obj if obj else ''), logger.LogLevel.ERROR)
|
||||
@@ -17,7 +17,7 @@ class CustomFormatter(logging.Formatter):
|
||||
logger = logging.getLogger('mylogger')
|
||||
# formatter = logging.Formatter('[ %(asctime)s][%(threadName)s][%(levelname)s][%(filename)s(%(lineno)s)] > %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
|
||||
# formatter = CustomFormatter('[ %(asctime)s][%(thread)s][%(levelname)s][%(pathname)s(%(lineno)s)]\n> %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
|
||||
formatter = CustomFormatter('', datefmt='%Y-%m-%d %H:%M:%S')
|
||||
formatter = CustomFormatter(datefmt='%Y-%m-%d %H:%M:%S')
|
||||
|
||||
|
||||
logging.handlers.RotatingFileHandler('crawler.log')
|
||||
|
||||
@@ -12,6 +12,7 @@ import threading
|
||||
import random
|
||||
|
||||
import requests
|
||||
import base.debug as dbg
|
||||
|
||||
|
||||
Base = sqlalchemy.ext.declarative.declarative_base()
|
||||
@@ -97,9 +98,8 @@ class Proxy2Handler:
|
||||
def __init__(self):
|
||||
self.lock = threading.Lock()
|
||||
self.engine = sqlalchemy.create_engine('mysql+pymysql://admin:admin123@bigbird.iptime.org/concepters?charset=utf8')
|
||||
session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine, autocommit=True, autoflush=True)
|
||||
self.session = sqlalchemy.orm.scoped_session(session_factory)
|
||||
pass
|
||||
SessionFactory = sqlalchemy.orm.sessionmaker(bind=self.engine, autocommit=True, autoflush=True)
|
||||
self.session = sqlalchemy.orm.scoped_session(SessionFactory)
|
||||
|
||||
def lock_enter(self):
|
||||
# logger.log('lock {}'.format(threading.current_thread().ident))
|
||||
@@ -113,7 +113,7 @@ class Proxy2Handler:
|
||||
|
||||
def commit(self):
|
||||
self.lock_enter()
|
||||
self.session.commit()
|
||||
# self.session.commit()
|
||||
self.lock_leave()
|
||||
|
||||
def get_oldest(self, platform):
|
||||
@@ -171,17 +171,24 @@ class Proxy2Handler:
|
||||
def get(self, platform, proc_id=-1):
|
||||
self.lock_enter()
|
||||
|
||||
try:
|
||||
|
||||
block_column = self.block_field_map[platform]
|
||||
try:
|
||||
instances = self.session.query(Proxy2Model).filter(block_column == None).all()
|
||||
instances = self.session.query(Proxy2Model).filter(block_column == None).limit(32).all()
|
||||
except Exception as e:
|
||||
dbg.print_exception()
|
||||
assert True
|
||||
|
||||
self.lock_leave()
|
||||
|
||||
try:
|
||||
session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine)
|
||||
self.session = sqlalchemy.orm.scoped_session(session_factory)
|
||||
except Exception as e2:
|
||||
logger.log('{} session recreate'.format(proc_id))
|
||||
# try:
|
||||
# session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine)
|
||||
# self.session = sqlalchemy.orm.scoped_session(session_factory)
|
||||
# logger.log('{} session recreate'.format(proc_id))
|
||||
#
|
||||
# except Exception as e2:
|
||||
# dbg.print_exception(e2)
|
||||
|
||||
return None
|
||||
|
||||
@@ -198,6 +205,9 @@ class Proxy2Handler:
|
||||
self.lock_leave()
|
||||
return self.get(platform, proc_id)
|
||||
|
||||
except Exception as e:
|
||||
dbg.print_exception(e)
|
||||
|
||||
def insert(self, ip, port):
|
||||
instance = self.get_instance(ip, port)
|
||||
if not instance:
|
||||
@@ -236,10 +246,13 @@ class Proxy2Handler:
|
||||
# self.session.add(Proxy2Model(proxy['ip'], proxy['port']))
|
||||
|
||||
def set_proxy_blocked(self, ip, port, platform):
|
||||
try:
|
||||
block_column = self.block_field_map[platform]
|
||||
query = self.get_query(ip, port)
|
||||
query.update({block_column: datetime.datetime.now()})
|
||||
self.commit()
|
||||
except Exception as e:
|
||||
dbg.print_exception(e)
|
||||
|
||||
if __name__ == '__main__':
|
||||
proxy_handler = Proxy2Handler()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from twitter.twconfig import TwitterConfig
|
||||
from twitter.twconfig import TwitterConfig
|
||||
from twitter.twdbhelper import TwitterDBHelper
|
||||
from twitter.tweet import Tweet
|
||||
from twitter.twparser import TweetParser
|
||||
@@ -7,6 +7,7 @@ import base.proxy
|
||||
import base.proxy2 as proxy2
|
||||
import base.baseclasses
|
||||
import base.logger as logger
|
||||
import base.debug as dbg
|
||||
|
||||
import requests
|
||||
import bs4
|
||||
@@ -19,7 +20,6 @@ import time
|
||||
|
||||
|
||||
class TwitterCrawler:
|
||||
|
||||
def __init__(self):
|
||||
self.default_config = TwitterConfig()
|
||||
self.db_helper = TwitterDBHelper()
|
||||
@@ -71,6 +71,7 @@ class TwitterCrawler:
|
||||
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
|
||||
'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
|
||||
}
|
||||
|
||||
proxy_key = '{}-{}'.format('runner' if is_runner else 'content', proc_id)
|
||||
if proxy_key not in self.proxy:
|
||||
# self.proxy[proxy_key] = base.proxy.get_proxy_for_requests()
|
||||
@@ -118,6 +119,7 @@ class TwitterCrawler:
|
||||
else:
|
||||
return j
|
||||
|
||||
|
||||
return {
|
||||
'items_html': '',
|
||||
'has_more_items': False,
|
||||
@@ -177,7 +179,7 @@ class TwitterCrawler:
|
||||
})
|
||||
# self.runner_processing[proc_id].value = False
|
||||
except Exception as e:
|
||||
logger.log(e, logger.LogLevel.ERROR)
|
||||
dbg.print_exception(e)
|
||||
|
||||
return proc_id, tweet_count,
|
||||
|
||||
@@ -258,7 +260,7 @@ class TwitterCrawler:
|
||||
test_tw.user_id = 'Awesome_vely'
|
||||
test_tw.tweet_id = 888704413111435264
|
||||
|
||||
test_tw.text = '?œìž‘'
|
||||
test_tw.text = '?<EFBFBD><EFBFBD><EFBFBD><EFBFBD>'
|
||||
self.insert_content_pool(0, content_qu, test_tw, test_tw)
|
||||
|
||||
content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
|
||||
@@ -301,7 +303,7 @@ class TwitterCrawler:
|
||||
start_time = time.time()
|
||||
|
||||
# run
|
||||
worker_count = 16
|
||||
worker_count = 1
|
||||
split_config = self.default_config.split()
|
||||
|
||||
content_qu = queue.Queue()
|
||||
|
||||
Reference in New Issue
Block a user