runner thread가 종료되는 문제 해결

This commit is contained in:
mjjo
2017-08-10 12:34:38 +09:00
parent 9e51f989fd
commit 16a9afbd9f
4 changed files with 418 additions and 390 deletions

View File

@@ -0,0 +1,13 @@
import linecache
import sys
import base.logger as logger
def print_exception(obj=None):
    """Log the currently-handled exception with file, line and source text.

    Must be called from inside an ``except`` block. *obj* is optional extra
    context appended to the log line (anything with a useful ``str()``).
    Logs at ERROR level via the project logger; returns None.
    """
    exc_type, exc_obj, tb = sys.exc_info()
    if tb is None:
        # Fix: original dereferenced tb.tb_frame unconditionally and raised
        # AttributeError when no exception was being handled.
        logger.log('print_exception() called with no active exception', logger.LogLevel.ERROR)
        return
    f = tb.tb_frame
    lineno = tb.tb_lineno
    filename = f.f_code.co_filename
    # Refresh linecache in case the source file changed since it was loaded.
    linecache.checkcache(filename)
    line = linecache.getline(filename, lineno, f.f_globals)
    logger.log('({}({}) Exception from "{}"):\n {}, {}'.format(filename, lineno, line.strip(), exc_obj, obj if obj else ''), logger.LogLevel.ERROR)

View File

@@ -17,7 +17,7 @@ class CustomFormatter(logging.Formatter):
logger = logging.getLogger('mylogger')
# formatter = logging.Formatter('[ %(asctime)s][%(threadName)s][%(levelname)s][%(filename)s(%(lineno)s)] > %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
# formatter = CustomFormatter('[ %(asctime)s][%(thread)s][%(levelname)s][%(pathname)s(%(lineno)s)]\n> %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
formatter = CustomFormatter('', datefmt='%Y-%m-%d %H:%M:%S')
formatter = CustomFormatter(datefmt='%Y-%m-%d %H:%M:%S')
logging.handlers.RotatingFileHandler('crawler.log')

View File

@@ -12,6 +12,7 @@ import threading
import random
import requests
import base.debug as dbg
Base = sqlalchemy.ext.declarative.declarative_base()
@@ -97,9 +98,8 @@ class Proxy2Handler:
def __init__(self):
    """Set up the proxy handler: a lock serialising DB access, the MySQL
    engine, and a thread-local scoped session.
    """
    self.lock = threading.Lock()
    # SECURITY NOTE(review): credentials are hard-coded in the connection URL;
    # move them to configuration or environment variables.
    self.engine = sqlalchemy.create_engine('mysql+pymysql://admin:admin123@bigbird.iptime.org/concepters?charset=utf8')
    # Fix: diff residue duplicated the sessionmaker/scoped_session creation and
    # left a stray `pass`; create the session factory exactly once.
    # autocommit/autoflush so individual statements persist without an explicit
    # commit(); scoped_session gives each thread its own session instance.
    session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine, autocommit=True, autoflush=True)
    self.session = sqlalchemy.orm.scoped_session(session_factory)
def lock_enter(self):
# logger.log('lock {}'.format(threading.current_thread().ident))
@@ -113,7 +113,7 @@ class Proxy2Handler:
def commit(self):
# Flush pending session changes while holding the handler lock.
# NOTE(review): diff residue — both an active session.commit() and a
# commented-out copy are present below. With autocommit=True sessions an
# explicit commit() may be unnecessary; confirm which variant is intended.
self.lock_enter()
self.session.commit()
# self.session.commit()
self.lock_leave()
def get_oldest(self, platform):
@@ -171,17 +171,24 @@ class Proxy2Handler:
def get(self, platform, proc_id=-1):
self.lock_enter()
try:
block_column = self.block_field_map[platform]
try:
instances = self.session.query(Proxy2Model).filter(block_column == None).all()
instances = self.session.query(Proxy2Model).filter(block_column == None).limit(32).all()
except Exception as e:
dbg.print_exception()
assert True
self.lock_leave()
try:
session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine)
self.session = sqlalchemy.orm.scoped_session(session_factory)
except Exception as e2:
logger.log('{} session recreate'.format(proc_id))
# try:
# session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine)
# self.session = sqlalchemy.orm.scoped_session(session_factory)
# logger.log('{} session recreate'.format(proc_id))
#
# except Exception as e2:
# dbg.print_exception(e2)
return None
@@ -198,6 +205,9 @@ class Proxy2Handler:
self.lock_leave()
return self.get(platform, proc_id)
except Exception as e:
dbg.print_exception(e)
def insert(self, ip, port):
instance = self.get_instance(ip, port)
if not instance:
@@ -236,10 +246,13 @@ class Proxy2Handler:
# self.session.add(Proxy2Model(proxy['ip'], proxy['port']))
def set_proxy_blocked(self, ip, port, platform):
    """Mark the proxy (ip, port) as blocked for *platform*.

    Stamps the platform's block column with the current timestamp and
    commits; any failure is logged via dbg.print_exception and swallowed.
    """
    try:
        column = self.block_field_map[platform]
        blocked_at = datetime.datetime.now()
        self.get_query(ip, port).update({column: blocked_at})
        self.commit()
    except Exception as err:
        dbg.print_exception(err)
if __name__ == '__main__':
# Smoke test: constructing the handler creates the DB engine and session.
proxy_handler = Proxy2Handler()

View File

@@ -1,4 +1,4 @@
from twitter.twconfig import TwitterConfig
from twitter.twconfig import TwitterConfig
from twitter.twdbhelper import TwitterDBHelper
from twitter.tweet import Tweet
from twitter.twparser import TweetParser
@@ -7,6 +7,7 @@ import base.proxy
import base.proxy2 as proxy2
import base.baseclasses
import base.logger as logger
import base.debug as dbg
import requests
import bs4
@@ -19,7 +20,6 @@ import time
class TwitterCrawler:
def __init__(self):
self.default_config = TwitterConfig()
self.db_helper = TwitterDBHelper()
@@ -71,6 +71,7 @@ class TwitterCrawler:
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
}
proxy_key = '{}-{}'.format('runner' if is_runner else 'content', proc_id)
if proxy_key not in self.proxy:
# self.proxy[proxy_key] = base.proxy.get_proxy_for_requests()
@@ -118,6 +119,7 @@ class TwitterCrawler:
else:
return j
return {
'items_html': '',
'has_more_items': False,
@@ -177,7 +179,7 @@ class TwitterCrawler:
})
# self.runner_processing[proc_id].value = False
except Exception as e:
logger.log(e, logger.LogLevel.ERROR)
dbg.print_exception(e)
return proc_id, tweet_count,
@@ -258,7 +260,7 @@ class TwitterCrawler:
test_tw.user_id = 'Awesome_vely'
test_tw.tweet_id = 888704413111435264
test_tw.text = '?œìž‘'
test_tw.text = '?<EFBFBD><EFBFBD><EFBFBD><EFBFBD>'
self.insert_content_pool(0, content_qu, test_tw, test_tw)
content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
@@ -301,7 +303,7 @@ class TwitterCrawler:
start_time = time.time()
# run
worker_count = 16
worker_count = 1
split_config = self.default_config.split()
content_qu = queue.Queue()