Files
clients/WebBasedCrawler/webbasedcrawler.py
2017-07-18 11:12:43 +09:00

98 lines
3.4 KiB
Python

#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import sys
from insta import instacrawl
from kakao import kakaocrawl
from naver import navercrawl
from facebook import facebookcrawl
from facebook import facebookcrawlbs
from twitter import twittercrawl
from youtube import youtubecrawl
from base.baseclasses import print_and_flush
class WebBasedCrawler:
    """Front end that picks a platform-specific crawler and runs it.

    The platform name decides which crawler class is instantiated; all
    other arguments are forwarded unchanged to that crawler's own
    ``set_arguments`` method.
    """

    def __init__(self, browser=None, platform=None, keyword_id=None, db_num=None, before_day=None, until_page=None):
        """Create and configure the crawler for *platform*.

        Accepts the same arguments as ``set_arguments`` and delegates to it,
        so an unknown platform (including the default ``None``) raises
        ``ValueError``.
        """
        self.set_arguments(browser, platform, keyword_id, db_num, before_day, until_page)

    @staticmethod
    def _create_crawler(platform):
        """Return a new crawler object for *platform*, or None if unknown."""
        if platform == "instagram":
            return instacrawl.InstaMainCrawler()
        if platform == "kakaochannel":
            return kakaocrawl.KakaoMainCrawler()
        if platform == "navercafe":
            return navercrawl.NaverCafeMainAreaCrawler()
        if platform == 'facebook':
            # The facebookcrawlbs implementation is the one in use here.
            return facebookcrawlbs.FacebookMainCrawler()
        if platform == 'twitter':
            return twittercrawl.TwitterCrawler()
        if platform == 'youtube':
            return youtubecrawl.YoutubeMainCrawler()
        return None

    def set_arguments(self, browser, platform, keyword_id, db_num, before_day, until_page):
        """Instantiate the crawler for *platform* and pass it the settings.

        Args:
            browser: forwarded to the crawler's ``set_arguments``.
            platform: one of "instagram", "kakaochannel", "navercafe",
                "facebook", "twitter" or "youtube".
            keyword_id: forwarded to the crawler's ``set_arguments``.
            db_num: forwarded to the crawler's ``set_arguments``.
            before_day: forwarded to the crawler's ``set_arguments``.
            until_page: forwarded to the crawler's ``set_arguments``.

        Raises:
            ValueError: if *platform* is not a supported name. In that case
                ``self.crawler`` is set to None before raising.
        """
        crawler = self._create_crawler(platform)
        self.crawler = crawler
        if crawler is None:
            # ValueError is still an Exception, so callers that caught the
            # previous bare Exception keep working, but now get a message.
            raise ValueError("unsupported platform: {!r}".format(platform))
        crawler.set_arguments(browser, keyword_id, db_num, before_day, until_page)

    def start(self):
        """Run the selected crawler by calling its ``start`` method."""
        self.crawler.start()
# Browser names accepted on the right-hand side of a browser.txt entry.
browser_opt = ('chrome', "ie", "opera", "firefox")
# Platform names accepted on the left-hand side of a browser.txt entry.
# 'kakaochannel' is the name WebBasedCrawler and the command line use;
# 'kakaostory' is kept so existing configuration files stay valid.
platform_opt = ('instagram', 'kakaostory', 'kakaochannel', 'navercafe', 'facebook', 'twitter', 'youtube')


def get_browser_info(platform_, file_name="browser.txt"):
    """Return the browser name configured for *platform_*.

    The configuration file holds one ``platform=browser`` entry per line.
    Blank lines and lines whose first non-blank character is ``#`` are
    skipped. The special key ``default`` overrides the built-in default,
    which is 'ie' on win32 and 'firefox' elsewhere.

    An entry is accepted only if its platform is ``default``, an already
    accepted key, or listed in ``platform_opt``, and its browser is listed
    in ``browser_opt``. Any other line, including one without exactly one
    ``=``, is reported with a "check option" message and ignored.

    Args:
        platform_: platform name to look up.
        file_name: path of the configuration file.

    Returns:
        The browser configured for *platform_*, or the default browser
        when there is no entry for it or the file does not exist.

    Raises:
        SystemExit: with code 1 if reading the file fails for any reason
            other than the file being missing.
    """
    if sys.platform == 'win32':
        options = {'default': 'ie'}
    else:
        options = {'default': 'firefox'}
    try:
        with open(file_name, 'r') as f:
            for line in f:
                entry = line.strip()
                if not entry or entry.startswith("#"):
                    continue
                # partition never raises: without "=" the browser part is
                # empty, with several "=" it contains one; both fail the
                # browser_opt check below instead of aborting the program.
                platform, _, browser = entry.partition("=")
                platform = platform.strip()
                browser = browser.strip()
                known_platform = platform in options or platform in platform_opt
                if not known_platform or browser not in browser_opt:
                    # The raw line is echoed unchanged.
                    print_and_flush("check option: " + line)
                else:
                    options[platform] = browser
    except FileNotFoundError:
        print_and_flush("browser.txt file is not exists")
        print_and_flush("use " + options['default'] + " browser")
    except Exception as e:
        print_and_flush(e)
        print_and_flush("Unknown error occurs")
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. under "python -S").
        sys.exit(1)
    return options.get(platform_, options['default'])
if __name__ == '__main__':
    # Command-line arguments:
    #   sys.argv[0] webbasedcrawler.py
    #   sys.argv[1] platform: instagram, kakaochannel, navercafe, facebook,
    #               twitter, youtube
    #   sys.argv[2] keyword_id
    #   sys.argv[3] data group (passed to WebBasedCrawler as db_num)
    #   sys.argv[4] start_day (passed to WebBasedCrawler as before_day)
    #   sys.argv[5] until_page
    if len(sys.argv) != 6:
        # Message text is kept exactly as it was, spelling included.
        print_and_flush("Check Argumenets!")
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. under "python -S").
        sys.exit(1)
    print_and_flush("Python Crawling Executed")
    platform_name, keyword_id, db_num, before_day, until_page = sys.argv[1:6]
    crawler = WebBasedCrawler(get_browser_info(platform_name), platform_name, keyword_id, db_num, before_day, until_page)
    crawler.start()
    print_and_flush("Finished Crawling :)")
    sys.exit(0)