웹크롤러 파이썬 카카오스토리 부분 디버깅
git-svn-id: svn://192.168.0.12/source@294 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -803,7 +803,11 @@ class InstaAlgorithmMulti(InstaAlgorithm):
|
|||||||
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
|
# printl(element['date'].strftime("%Y-%m-%d %H:%M:%S"))
|
||||||
# wait(1.5)
|
# wait(1.5)
|
||||||
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
|
# self.crawl_content(element['url'], list_crawler.get_cookies(), list_crawler.get_url())
|
||||||
self.list_crawl.put(element)
|
try:
|
||||||
|
self.list_crawl.put(element, timeout=10)
|
||||||
|
except Exception as e:
|
||||||
|
printl(e)
|
||||||
|
printl("queue size = ", self.list_crawl.qsize())
|
||||||
backup_set.add(element['url'])
|
backup_set.add(element['url'])
|
||||||
self.total_num += 1
|
self.total_num += 1
|
||||||
if self.is_until_page():
|
if self.is_until_page():
|
||||||
@@ -869,12 +873,15 @@ class InstaAlgorithmMulti(InstaAlgorithm):
|
|||||||
|
|
||||||
# stop child process
|
# stop child process
|
||||||
for i in range(num_of_content_process):
|
for i in range(num_of_content_process):
|
||||||
self.list_crawl.put(None)
|
self.list_crawl.put(None, timeout=10)
|
||||||
|
|
||||||
# wait child process
|
# wait child process
|
||||||
for p in p_list:
|
for p in p_list:
|
||||||
p.join()
|
p.join()
|
||||||
|
|
||||||
|
for _ in range(self.list_crawl.qsize()):
|
||||||
|
self.list_crawl.get(block=False)
|
||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.info(e)
|
logging.info(e)
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ class KakaoInit(CrawlInit):
|
|||||||
date_now = datetime.datetime.now()
|
date_now = datetime.datetime.now()
|
||||||
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
|
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
|
||||||
result += datetime.timedelta(days=self.before_day)
|
result += datetime.timedelta(days=self.before_day)
|
||||||
return result
|
return result.date()
|
||||||
else:
|
else:
|
||||||
return self.start_day()
|
return self.start_day()
|
||||||
|
|
||||||
@@ -122,7 +122,7 @@ class KakaoInit(CrawlInit):
|
|||||||
if self.is_realtime():
|
if self.is_realtime():
|
||||||
date_now = datetime.datetime.now()
|
date_now = datetime.datetime.now()
|
||||||
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
|
result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
|
||||||
return result
|
return result.date()
|
||||||
else:
|
else:
|
||||||
return self.end_day()
|
return self.end_day()
|
||||||
|
|
||||||
@@ -967,6 +967,7 @@ class KakaoMainCrawler:
|
|||||||
i += 1
|
i += 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.info(e)
|
logging.info(e)
|
||||||
|
# check for exception
|
||||||
# self.driver.quit()
|
# self.driver.quit()
|
||||||
self.set_driver(self.browser.new_browser())
|
self.set_driver(self.browser.new_browser())
|
||||||
wait(5)
|
wait(5)
|
||||||
@@ -975,5 +976,5 @@ class KakaoMainCrawler:
|
|||||||
printl("Finished Crawling :)")
|
printl("Finished Crawling :)")
|
||||||
|
|
||||||
self.send_to_db.close()
|
self.send_to_db.close()
|
||||||
# self.driver.quit()
|
self.driver.quit()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user