#-*- coding: utf-8 -*-
|
|
'''
Created on 2015. 12. 8.

@author: cococo
'''
|
|
import sys
|
|
|
|
from insta import instacrawl
|
|
from kakao import kakaocrawl
|
|
from naver import navercrawl
|
|
from facebook import facebookcrawl
|
|
from facebook import facebookcrawlbs
|
|
from twitter import twittercrawl
|
|
# from youtube import youtubecrawl
|
|
|
|
from base.baseclasses import print_and_flush
|
|
|
|
|
|
class WebBasedCrawler:
    """Dispatch crawling to a platform-specific crawler.

    Selects the concrete crawler implementation from the ``platform``
    string, forwards the remaining arguments to it, and exposes a single
    ``start()`` entry point.
    """

    def __init__(self, browser=None, platform=None, keyword_id=None, db_num=None, before_day=None, until_page=None):
        """Build and configure the platform-specific crawler.

        Raises:
            ValueError: if ``platform`` is not a supported platform name.
        """
        self.set_arguments(browser, platform, keyword_id, db_num, before_day, until_page)

    def set_arguments(self, browser, platform, keyword_id, db_num, before_day, until_page):
        """Instantiate the crawler matching ``platform`` and pass it the arguments."""
        if platform == "instagram":
            self.crawler = instacrawl.InstaMainCrawler()
        elif platform == "kakaochannel":
            self.crawler = kakaocrawl.KakaoMainCrawler()
        elif platform == "navercafe":
            self.crawler = navercrawl.NaverCafeMainAreaCrawler()
        elif platform == 'facebook':
            # the BeautifulSoup-based implementation (facebookcrawlbs) is the
            # one actually used; plain facebookcrawl is only imported
            self.crawler = facebookcrawlbs.FacebookMainCrawler()
        elif platform == 'twitter':
            self.crawler = twittercrawl.TwitterCrawler()
        elif platform == 'youtube':
            # NOTE(review): the `from youtube import youtubecrawl` import is
            # commented out at the top of this file, so this branch raises
            # NameError at runtime — restore the import before enabling it.
            self.crawler = youtubecrawl.YoutubeMainCrawler()
        else:
            self.crawler = None
            # was a bare `raise Exception`; ValueError (a subclass of
            # Exception, so existing handlers still match) names the cause
            raise ValueError("unsupported platform: {}".format(platform))
        self.crawler.set_arguments(browser, keyword_id, db_num, before_day, until_page)

    def start(self):
        """Delegate to the configured crawler's ``start``."""
        self.crawler.start()
|
|
|
|
|
|
# Browser names accepted in browser.txt option lines.
browser_opt = ('chrome', "ie", "opera", "firefox")
# Platform names accepted in browser.txt option lines.
# NOTE(review): this lists 'kakaostory' while the crawler dispatch and the
# __main__ usage text use 'kakaochannel' — confirm which name is intended.
platform_opt = ('instagram', 'kakaostory', 'navercafe', 'facebook', 'twitter', 'youtube')
|
|
|
|
|
|
def get_browser_info(platform_, file_name="browser.txt"):
    """Return the browser name configured for ``platform_``.

    Reads ``file_name`` line by line; each non-comment, non-blank line is
    expected to look like ``platform=browser``.  Lines naming an unknown
    platform or browser are reported via ``print_and_flush`` and skipped.
    Falls back to a per-OS default ('ie' on Windows, 'firefox' elsewhere)
    when the file is missing or ``platform_`` is not configured.

    Args:
        platform_: platform name whose configured browser is requested.
        file_name: path of the option file (defaults to "browser.txt").

    Returns:
        The configured browser name, or the OS default.
    """
    if sys.platform == 'win32':
        options = {'default': 'ie'}
    else:
        options = {'default': 'firefox'}
    try:
        with open(file_name, 'r') as f:
            for line in f:
                if line.startswith("#"):
                    continue
                elif len(line.strip()) < 1:
                    continue
                else:
                    # split only on the first '=': a missing or extra '='
                    # now yields a rejected option line instead of the
                    # ValueError (and exit(1)) that line.split("=") caused
                    platform, _, browser = line.partition("=")
                    platform = platform.strip()
                    browser = browser.strip()
                    if (platform not in options and platform not in platform_opt) or browser not in browser_opt:
                        print_and_flush("check option: " + line)
                    else:
                        options[platform] = browser
    except FileNotFoundError:
        # bug fix: report the file that was actually requested instead of a
        # hard-coded "browser.txt" (grammar of the message fixed as well)
        print_and_flush(file_name + " file does not exist")
        print_and_flush("use " + options['default'] + " browser")
    except Exception as e:
        print_and_flush(e)
        print_and_flush("Unknown error occurs")
        exit(1)
    return options.get(platform_, options['default'])
|
|
|
|
if __name__ == '__main__':
    """
    sys.argv[0] webbasedcrawler.py
    sys.argv[1] instagram, kakaochannel, navercafe, facebook, twitter, youtube
    sys.argv[2] keyword_id
    sys.argv[3] data group
    sys.argv[4] start_day
    sys.argv[5] until_page
    """

    print("arguments: {}".format(' '.join(sys.argv)))

    # exactly five user-supplied arguments are required (see usage above)
    if len(sys.argv) == 6:
        print_and_flush("Python Crawling Executed")
    else:
        # bug fix: corrected misspelled user-facing message ("Argumenets")
        print_and_flush("Check Arguments!")
        exit(1)

    crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
    crawler.start()
    print_and_flush("Finished Crawling :)")
    exit(0)
|