Files
clients/WebBasedCrawler/webbasedcrawler.py
admin 21b11500bd selenium, beautifulsoup4로 구현한 python 크롤러
git-svn-id: svn://192.168.0.12/source@241 8346c931-da38-4b9b-9d4c-e48b93cbd075
2016-01-19 06:52:00 +00:00

93 lines
3.1 KiB
Python

#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import sys
from insta import instacrawl
from kakao import kakaocrawl
from naver import navercrawl
from facebook import facebookcrawl
from facebook import facebookcrawlbs
from base.baseclasses import print_and_flush
class WebBasedCrawler:
    """Facade that selects and runs a platform-specific crawler.

    Dispatches on the platform name to one of the imported crawler
    implementations and forwards the remaining CLI arguments to it.
    """

    def __init__(self, browser=None, platform=None, keyword_id=None, db_num=None, before_day=None, until_page=None):
        self.set_arguments(browser, platform, keyword_id, db_num, before_day, until_page)

    def set_arguments(self, browser, platform, keyword_id, db_num, before_day, until_page):
        """Instantiate the crawler for *platform* and pass the arguments on.

        Raises:
            ValueError: if *platform* is not one of the supported names
                (instagram, kakaochannel, navercafe, facebook).
        """
        if platform == "instagram":
            self.crawler = instacrawl.InstaMainCrawler()
        elif platform == "kakaochannel":
            self.crawler = kakaocrawl.KakaoMainCrawler()
        elif platform == "navercafe":
            self.crawler = navercrawl.NaverCafeMainAreaCrawler()
        elif platform == "facebook":
            # The BeautifulSoup-based implementation is the one actually used;
            # facebookcrawl stays imported at module level but is not selected here.
            self.crawler = facebookcrawlbs.FacebookMainCrawler()
        else:
            self.crawler = None
            # Was `raise Exception` (bare class, no message). ValueError is an
            # Exception subclass, so existing `except Exception` callers still work.
            raise ValueError("unsupported platform: %r" % (platform,))
        self.crawler.set_arguments(browser, keyword_id, db_num, before_day, until_page)

    def start(self):
        """Run the previously selected crawler."""
        self.crawler.start()
browser_opt = ('chrome', "ie", "opera", "firefox")
platform_opt = ('instagram', 'kakaostory', 'navercafe', "facebook")
def get_browser_info(platform_, file_name="browser.txt"):
if sys.platform == 'win32':
options = {'default': 'ie'}
else:
options = {'default': 'firefox'}
try:
with open(file_name, 'r') as f:
for line in f:
if line.startswith("#"):
continue
elif len(line.strip()) < 1:
continue
else:
platform, browser = line.split("=")
platform = platform.strip()
browser = browser.strip()
if (platform not in options.keys() and platform not in platform_opt) or browser not in browser_opt:
print_and_flush("check option: " + line)
else:
options[platform] = browser
except FileNotFoundError:
print_and_flush("browser.txt file is not exists")
print_and_flush("use " + options['default'] + " browser")
except Exception as e:
print_and_flush(e)
print_and_flush("Unknown error occurs")
exit(1)
return options.get(platform_, options['default'])
if __name__ == '__main__':
    """
    sys.argv[0] webbasedcrawler.py
    sys.argv[1] instagram, kakaochannel, navercafe, facebook
    sys.argv[2] keyword_id
    sys.argv[3] data group
    sys.argv[4] start_day
    sys.argv[5] until_page
    """
    # Exactly 5 user arguments are required on top of the script name.
    if len(sys.argv) == 6:
        print_and_flush("Python Crawling Executed")
    else:
        print_and_flush("Check Arguments!")  # fixed typo: was "Argumenets"
        exit(1)
    # Browser choice is looked up per platform from browser.txt (with an
    # OS-dependent fallback), then everything is handed to the dispatcher.
    crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2],
                              sys.argv[3], sys.argv[4], sys.argv[5])
    crawler.start()
    print_and_flush("Finished Crawling :)")
    exit(0)