Files
clients/WebBasedCrawler/webbasedcrawler.py
mjjo 1fb61f0b4c 트위터 크롤러 수정
- 프록시를 porxy2 db에 넣고 사용
2017-08-09 15:32:57 +09:00

98 lines
3.4 KiB
Python

#-*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.
@author: cococo
'''
import sys
from insta import instacrawl
from kakao import kakaocrawl
from naver import navercrawl
from facebook import facebookcrawl
from facebook import facebookcrawlbs
from twitter import twittercrawl
# from youtube import youtubecrawl
from base.baseclasses import print_and_flush
class WebBasedCrawler:
    """Facade that selects and drives a platform-specific web crawler.

    Maps a platform name ('instagram', 'kakaochannel', 'navercafe',
    'facebook', 'twitter', 'youtube') to the matching crawler class,
    forwards the crawl arguments to it, and delegates `start()`.
    """

    def __init__(self, browser=None, platform=None, keyword_id=None, db_num=None, before_day=None, until_page=None):
        """Build the crawler for *platform* and hand it the crawl arguments.

        Raises ValueError if *platform* is not one of the supported names.
        """
        self.set_arguments(browser, platform, keyword_id, db_num, before_day, until_page)

    def set_arguments(self, browser, platform, keyword_id, db_num, before_day, until_page):
        """Instantiate the crawler matching *platform* and configure it."""
        if platform == "instagram":
            self.crawler = instacrawl.InstaMainCrawler()
        elif platform == "kakaochannel":
            self.crawler = kakaocrawl.KakaoMainCrawler()
        elif platform == "navercafe":
            self.crawler = navercrawl.NaverCafeMainAreaCrawler()
        elif platform == 'facebook':
            # The bs-based implementation is used; plain facebookcrawl is
            # imported at the top of the file but not selected here.
            self.crawler = facebookcrawlbs.FacebookMainCrawler()
        elif platform == 'twitter':
            self.crawler = twittercrawl.TwitterCrawler()
        elif platform == 'youtube':
            # NOTE(review): the `from youtube import youtubecrawl` import is
            # commented out at the top of the file, so this branch raises
            # NameError until that import is restored.
            self.crawler = youtubecrawl.YoutubeMainCrawler()
        else:
            self.crawler = None
            # Bug fix: was a bare `raise Exception` with no message.
            # ValueError subclasses Exception, so existing handlers still match.
            raise ValueError("unsupported platform: %r" % (platform,))
        self.crawler.set_arguments(browser, keyword_id, db_num, before_day, until_page)

    def start(self):
        """Delegate the crawl to the selected platform crawler."""
        self.crawler.start()
# Browsers a browser.txt entry is allowed to select.
browser_opt = ('chrome', "ie", "opera", "firefox")
# Platforms accepted when validating browser.txt entries.
# Bug fix: 'kakaochannel' added — WebBasedCrawler.set_arguments and the CLI
# docs use that name, so valid kakaochannel lines were being rejected.
# 'kakaostory' is kept for backward compatibility with existing config files.
platform_opt = ('instagram', 'kakaostory', 'kakaochannel', 'navercafe', 'facebook', 'twitter', 'youtube')
def get_browser_info(platform_, file_name="browser.txt"):
    """Return the browser configured for *platform_* in *file_name*.

    The file holds ``platform = browser`` lines; ``#`` comment lines and
    blank lines are ignored.  A ``default`` entry overrides the OS default
    ('ie' on Windows, 'firefox' elsewhere).  Unknown platforms fall back
    to the default browser.  A missing file is not an error.
    """
    if sys.platform == 'win32':
        options = {'default': 'ie'}
    else:
        options = {'default': 'firefox'}
    try:
        with open(file_name, 'r') as f:
            for line in f:
                stripped = line.strip()
                # Skip blank lines and comments (even indented ones —
                # previously an indented '#' line slipped past startswith,
                # failed the '=' split and killed the whole program).
                if not stripped or stripped.startswith("#"):
                    continue
                # Bug fix: split on the first '=' only and tolerate lines
                # without one; line.split("=") raised ValueError on malformed
                # lines, which the broad except below turned into exit(1).
                platform, sep, browser = stripped.partition("=")
                platform = platform.strip()
                browser = browser.strip()
                if (not sep
                        or (platform not in options and platform not in platform_opt)
                        or browser not in browser_opt):
                    # Report the bad line but keep reading the rest.
                    print_and_flush("check option: " + line)
                else:
                    options[platform] = browser
    except FileNotFoundError:
        print_and_flush("browser.txt file is not exists")
        print_and_flush("use " + options['default'] + " browser")
    except Exception as e:
        # Unexpected failure (I/O error, etc.) — report and abort.
        print_and_flush(e)
        print_and_flush("Unknown error occurs")
        exit(1)
    return options.get(platform_, options['default'])
if __name__ == '__main__':
    # Command-line usage:
    #   sys.argv[0]  webbasedcrawler.py
    #   sys.argv[1]  platform: instagram, kakaochannel, navercafe,
    #                facebook, twitter, youtube
    #   sys.argv[2]  keyword_id
    #   sys.argv[3]  data group (db_num)
    #   sys.argv[4]  start day (passed as before_day)
    #   sys.argv[5]  until_page
    # Guard clause: exactly five user arguments are required.
    if len(sys.argv) != 6:
        # Bug fix: user-facing message had a typo ("Argumenets").
        print_and_flush("Check Arguments!")
        exit(1)
    print_and_flush("Python Crawling Executed")
    crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1],
                              sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
    crawler.start()
    print_and_flush("Finished Crawling :)")
    exit(0)