diff --git a/WebBasedCrawler/base/proxy2.py b/WebBasedCrawler/base/proxy2.py index adb61e3..2d3c33e 100644 --- a/WebBasedCrawler/base/proxy2.py +++ b/WebBasedCrawler/base/proxy2.py @@ -132,7 +132,7 @@ class Proxy2Handler: return instance def check_all_proxies(self, platform): - print('check all start') + # print('check all start') url_map = { Platform.NAVER: 'https://www.naver.com', @@ -165,7 +165,7 @@ class Proxy2Handler: else: instance.set_block_at(platform, datetime.datetime.now()) - print('check all end') + # print('check all end') return alive_cnt def get(self, platform, proc_id=-1): @@ -192,7 +192,10 @@ class Proxy2Handler: return None - instance = instances[random.randint(0, len(instances)-1)] if len(instances) > 0 else None + instance = None + if len(instances) > 0: + instance = instances[random.randint(0, len(instances)-1)] + if instance: self.lock_leave() return instance.get_instance_for_http() diff --git a/WebBasedCrawler/base/proxy_crawler.py b/WebBasedCrawler/base/proxy_crawler.py index dba1aca..403817a 100644 --- a/WebBasedCrawler/base/proxy_crawler.py +++ b/WebBasedCrawler/base/proxy_crawler.py @@ -86,12 +86,12 @@ def check_proxy(qu, proxy, url): def crawl_proxies(check_url=None): - print('proxy crawling start') + # print('proxy crawling start') proxies = get_proxies_free_proxy() proxies += get_proxies_proxy_searcher() # proxies += get_proxies_nntime() # proxies = list(set(proxies)) - print('proxy crawled {}'.format(len(proxies))) + # print('proxy crawled {}'.format(len(proxies))) if check_url: qu = queue.Queue() @@ -111,7 +111,7 @@ def crawl_proxies(check_url=None): else: proxies_alive = proxies - print('proxy crawling end') + # print('proxy crawling end') return proxies_alive # proxies_alive.sort() diff --git a/WebBasedCrawler/twitter/twittercrawl.py b/WebBasedCrawler/twitter/twittercrawl.py index eceab9b..0e9d15f 100644 --- a/WebBasedCrawler/twitter/twittercrawl.py +++ b/WebBasedCrawler/twitter/twittercrawl.py @@ 
-303,7 +303,7 @@ class TwitterCrawler: start_time = time.time() # run - worker_count = 1 + worker_count = 16 split_config = self.default_config.split() content_qu = queue.Queue()