diff --git a/WebBasedCrawler/browser.txt b/WebBasedCrawler/browser.txt
new file mode 100644
--- /dev/null
+++ b/WebBasedCrawler/browser.txt
@@ -0,0 +1,27 @@
+# This file configures which browser you use when crawling
+
+# Platforms are instagram, kakaostory, navercafe
+# You can use these options: chrome, firefox, opera, ie
+
+# If you have installed chromedriver and Chrome in the same folder
+#platform=chrome
+
+# If you use windows and have installed IEDriverServer.exe
+#platform=ie
+
+# If you have installed Firefox
+#platform=firefox
+
+# If you have installed OperaBrowser and operadriver
+#platform=opera
+
+# You can also specify a browser option for each platform
+# If the browser option is empty for a platform, the crawler will use the default
+# If the browser.txt file is empty or not configured,
+# ie is the default on Windows and firefox is the default on Linux
+
+default=chrome
+kakaostory=chrome
+#instagram=firefox
+navercafe=firefox
+#facebook=chrome
diff --git a/WebBasedCrawler/effect.ini b/WebBasedCrawler/effect.ini
new file mode 100644
--- /dev/null
+++ b/WebBasedCrawler/effect.ini
@@ -0,0 +1,6 @@
+# NOTE(review): plaintext DB credentials committed to VCS - move to env vars or a secrets store
+[database]
+user=root
+pass=1234
+host=192.168.0.82
+name=bigbird
\ No newline at end of file
diff --git a/WebBasedCrawler/effectprocess.py b/WebBasedCrawler/effectprocess.py
new file mode 100644
--- /dev/null
+++ b/WebBasedCrawler/effectprocess.py
@@ -0,0 +1,88 @@
+import sys
+
+import base.baseclasses
+import effect.effecterror
+import effect.effectinstagram
+import effect.effectkakaostory
+from base.baseclasses import printl
+
+# Browsers a platform entry in browser.txt may select.
+browser_opt = ('chrome', 'ie', 'opera', 'firefox')
+# Crawling platforms that may appear as keys in browser.txt.
+platform_opt = ('instagram', 'kakaostory', 'navercafe', 'facebook')
+
+
+def get_browser_info(platform_, file_name="browser.txt"):
+    """Return the browser name configured for *platform_* in *file_name*.
+
+    Falls back to the 'default' entry (ie on Windows, firefox elsewhere)
+    when the file is missing, unreadable or has no entry for the platform.
+    """
+    if sys.platform == 'win32':
+        options = {'default': 'ie'}
+    else:
+        options = {'default': 'firefox'}
+    try:
+        with open(file_name, 'r') as f:
+            for line in f:
+                line = line.strip()
+                # Skip comments and blank lines.
+                if not line or line.startswith("#"):
+                    continue
+                # Split on the first '=' only, so values may contain '='.
+                platform, _, browser = line.partition("=")
+                platform = platform.strip()
+                browser = browser.strip()
+                # Silently ignore unknown platform/browser names.
+                if (platform in options or platform in platform_opt) and browser in browser_opt:
+                    options[platform] = browser
+    except (OSError, UnicodeDecodeError):
+        # Missing or unreadable config file: keep the built-in defaults.
+        pass
+    return options.get(platform_, options['default'])
+
+
+def get_effect_process(platform_, event_num, url):
+    """Build the effect-process object for *platform_*, or None if unsupported.
+
+    Instagram runs without a webdriver; the other platforms get a selenium
+    driver whose browser is chosen via browser.txt.
+    """
+    if platform_ == 'instagram':
+        return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url)
+    browser_info = get_browser_info(platform_)
+    browser = base.baseclasses.Browser()
+    driver = browser.get_new_driver(browser_info)
+    if platform_ == 'kakaostory':
+        return effect.effectkakaostory.EffectKakaostory(int(event_num), int(event_num), url, driver)
+    # NOTE(review): the freshly created driver leaks here - consider driver.quit().
+    return None
+
+
+if __name__ == '__main__':
+    # Usage: effectprocess.py <platform> <event_num> <url>
+    #   platform: instagram, kakaostory, facebook
+    # Output uses the '!@#' field separator expected by the parent process.
+    if len(sys.argv) != 4:
+        printl("x!@#!@#!@#e010!@#check argument")
+        exit(1)
+
+    # Pre-bind so the except blocks can safely test whether creation succeeded
+    # (previously a failure inside get_effect_process raised NameError here).
+    effect_process = None
+    try:
+        effect_process = get_effect_process(sys.argv[1], sys.argv[2], sys.argv[3])
+        effect_process.start()
+    except effect.effecterror.EffectException as e:
+        printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + str(e))
+        if sys.argv[1] != 'instagram' and effect_process is not None:
+            effect_process.driver.close()
+        exit(1)
+    except Exception as e:
+        printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + 'e012' + str(e))
+        if sys.argv[1] != 'instagram' and effect_process is not None:
+            effect_process.driver.close()
+        exit(1)
+
+    printl("o!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]))
+    if sys.argv[1] != 'instagram':
+        effect_process.driver.close()
+    exit(0)
diff --git a/WebBasedCrawler/rankcheck.py b/WebBasedCrawler/rankcheck.py
new file mode 100644
--- /dev/null
+++ b/WebBasedCrawler/rankcheck.py
@@ -0,0 +1,54 @@
+import time
+
+from selenium.webdriver.common.keys import Keys
+
+import base.baseclasses
+
+# Naver blog-tab search URL; the keyword is appended as the query value.
+base_url = 'https://search.naver.com/search.naver?where=post&sm=tab_jum&ie=utf8&query='
+keywords = ['vsl', '유산균']
+
+
+if '__main__' == __name__:
+    browser = base.baseclasses.Browser()
+    driver = browser.new_firefox_browser()
+    for keyword in keywords:
+        driver.get(base_url + keyword)
+        time.sleep(10)  # let the result page render
+        rank = 1
+        # One timestamped result file per keyword.
+        with open(keyword + time.strftime("%Y%m%d_%H%M%S") + ".txt", 'w') as f:
+            try:
+                # Walk at most 100 result pages.
+                for _ in range(1, 101):
+                    ul = driver.find_element_by_css_selector("ul[class^='type']")
+                    lis = ul.find_elements_by_css_selector("li[class='sh_blog_top']")
+                    for li in lis:
+                        # The post link appears in one of two markup layouts.
+                        try:
+                            a = li.find_element_by_xpath("div/a")
+                        except Exception:
+                            a = li.find_element_by_xpath("dl/dt/a")
+                        href = a.get_attribute('href')
+                        # Normalize the redirect URL to the canonical post URL.
+                        href = href.replace("?Redirect=Log&logNo=", "/")
+                        f.write("{0}: {1}\n".format(rank, href))
+                        print("{0}: {1}".format(rank, href))
+                        f.flush()
+                        rank += 1
+                    # Follow the page link right after the current (<strong>) page.
+                    div_paging = driver.find_element_by_css_selector("div[class='paging']")
+                    pages = div_paging.find_elements_by_css_selector("*")
+                    clickable = False
+                    for page in pages:
+                        if page.tag_name == "strong":
+                            clickable = True
+                        elif clickable and page.tag_name == "a":
+                            page.send_keys(Keys.NULL)
+                            page.send_keys(Keys.ENTER)
+                            time.sleep(10)
+                            break
+            except Exception:
+                # Last page reached (no next link) or the page layout changed.
+                pass
+    driver.quit()
diff --git a/WebBasedCrawler/rankcheckin.py b/WebBasedCrawler/rankcheckin.py
new file mode 100644
--- /dev/null
+++ b/WebBasedCrawler/rankcheckin.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python3
+"""Count how many top-ranked URLs from file1 also appear in file2."""
+import os.path
+import sys
+
+
+def read_urls(path):
+    # Extract the URL part of each 'rank: url' line; skip malformed lines
+    # (previously a line without 'http' raised ValueError).
+    urls = []
+    with open(path) as f:
+        for line in f:
+            if 'http' in line:
+                urls.append(line[line.index('http'):].rstrip('\n'))
+    return urls
+
+
+if "__main__" == __name__:
+    if len(sys.argv) not in (3, 4):
+        print("Usage : python {0} file1 file2 [ranknum]".format(sys.argv[0]))
+        exit(1)
+
+    if not os.path.isfile(sys.argv[1]) or not os.path.isfile(sys.argv[2]):
+        print("check files :\nfile1 : {0}\nfile2 : {1}".format(sys.argv[1], sys.argv[2]))
+        exit(1)
+
+    # Optional third argument bounds the comparison window; default 1000.
+    if len(sys.argv) == 4 and sys.argv[3].isnumeric():
+        rank = int(sys.argv[3])
+    else:
+        rank = 1000
+
+    rank1 = read_urls(sys.argv[1])
+    rank2 = read_urls(sys.argv[2])
+
+    count = 0
+    # Use the top-*rank* slices only when both files are long enough.
+    if rank <= len(rank1) and rank <= len(rank2):
+        candidates, pool = rank1[:rank], set(rank2[:rank])
+    else:
+        candidates, pool = rank1, set(rank2)
+    for url in candidates:
+        if url in pool:
+            count += 1
+    print(count)
diff --git a/column.txt b/column.txt
new file mode 100644
--- /dev/null
+++ b/column.txt
@@ -0,0 +1,19 @@
+#데이타 베이스명,컬럼 명,Date 검색,Count
+platform_name,플랫폼 이름,x,x
+platform_form,플롯폼,x,x
+article_form,종류,x,x
+article_id,아이디,x,o
+article_nickname,닉네임,x,o
+article_date,날짜,o,o
+article_title,타이틀,x,x
+article_data,데이타,x,x
+platform_id,플랫폼 아이디,x,o
+platform_title,플랫폼 타이틀,x,x
+article_url,주소,x,
+article_parent,상위 댓글 작성자,x,o
+article_order,댓글 순서,x,x
+reply_url,다른 주소,x,
+article_hit,조회수,x,x
+keyword_id,검색어 번호,x
+article_profileurl,프로파일 주소,x
+article_profile,프로파일,x
\ No newline at end of file