git-svn-id: svn://192.168.0.12/source@343 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
27
WebBasedCrawler/browser.txt
Normal file
27
WebBasedCrawler/browser.txt
Normal file
@@ -0,0 +1,27 @@
|
||||
# This file configures which browser you use when crawling
|
||||
|
||||
# Platforms are instagram, kakaostory, navercafe
|
||||
# You can use these options: chrome, firefox, opera, ie
|
||||
|
||||
# If you have installed chromedriver and Chrome in the same folder
|
||||
#platform=chrome
|
||||
|
||||
# If you use windows and have installed IEDriverServer.exe
|
||||
#platform=ie
|
||||
|
||||
# If you have installed Firefox
|
||||
#platform=firefox
|
||||
|
||||
# If you have installed OperaBrowser and operadriver
|
||||
#platform=opera
|
||||
|
||||
# You can also specify a browser option for each platform
|
||||
# If the browser option is empty for a platform, the crawler will use the default
|
||||
# If browser.txt file is empty or not configured,
|
||||
# ie is default in Windows. firefox is default in Linux
|
||||
|
||||
default=chrome
|
||||
kakaostory=chrome
|
||||
#instagram=firefox
|
||||
navercafe=firefox
|
||||
#facebook=chrome
|
||||
5
WebBasedCrawler/effect.ini
Normal file
5
WebBasedCrawler/effect.ini
Normal file
@@ -0,0 +1,5 @@
|
||||
[database]
|
||||
user=root
|
||||
pass=1234
|
||||
host=192.168.0.82
|
||||
name=bigbird
|
||||
78
WebBasedCrawler/effectprocess.py
Normal file
78
WebBasedCrawler/effectprocess.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import effect.effectinstagram
|
||||
import effect.effecterror
|
||||
import effect.effectkakaostory
|
||||
from base.baseclasses import printl
|
||||
import sys
|
||||
import base.baseclasses
|
||||
|
||||
# Browser names and platform keys accepted in browser.txt.
browser_opt = ('chrome', "ie", "opera", "firefox")
platform_opt = ('instagram', 'kakaostory', 'navercafe', "facebook")


def get_browser_info(platform_, file_name="browser.txt"):
    """Return the configured browser name for *platform_*.

    Reads ``file_name`` (``key=value`` lines, ``#`` comments, blank lines
    ignored) and builds a platform -> browser map.  Falls back to the
    OS default ('ie' on Windows, 'firefox' elsewhere) when the file is
    missing or unreadable, or when it has no entry for *platform_*.

    :param platform_: platform key, e.g. 'kakaostory' or 'navercafe'
    :param file_name: path of the browser configuration file
    :return: one of 'chrome', 'ie', 'opera', 'firefox'
    """
    # OS-dependent fallback, used when nothing else matches.
    if sys.platform == 'win32':
        options = {'default': 'ie'}
    else:
        options = {'default': 'firefox'}
    try:
        with open(file_name, 'r') as f:
            for line in f:
                stripped = line.strip()
                # Skip comments and blank lines.
                if stripped.startswith("#") or not stripped:
                    continue
                # A malformed line (zero or several '=') is ignored rather
                # than aborting the whole configuration read.  The original
                # code let the ValueError escape into a `finally: return`,
                # which silently truncated parsing at the bad line.
                try:
                    platform, browser = stripped.split("=")
                except ValueError:
                    continue
                platform = platform.strip()
                browser = browser.strip()
                # Accept only known platform keys and browser names.
                if (platform in options or platform in platform_opt) and browser in browser_opt:
                    options[platform] = browser
    except OSError:
        # Missing/unreadable config file: fall through to the defaults.
        # (Previously a bare try/finally swallowed *every* exception here,
        # including KeyboardInterrupt.)
        pass
    return options.get(platform_, options['default'])
|
||||
|
||||
|
||||
def get_effect_process(platform_, event_num, url):
    """Build and return the effect worker object for *platform_*.

    Instagram runs without a Selenium driver; the other platforms get a
    fresh browser driver chosen via get_browser_info().  Platforms with
    no worker implementation yield ``None``.

    :param platform_: 'instagram', 'kakaostory', ... (see platform_opt)
    :param event_num: event identifier, converted to int
    :param url: target URL for the worker
    """
    num = int(event_num)

    # Instagram is handled API-side; no browser needed.
    if platform_ == 'instagram':
        return effect.effectinstagram.EffectInsta(num, num, url)

    # Every other platform is driven through a real browser session.
    browser_info = get_browser_info(platform_)
    driver = base.baseclasses.Browser().get_new_driver(browser_info)

    if platform_ != 'kakaostory':
        return None
    return effect.effectkakaostory.EffectKakaostory(num, num, url, driver)
|
||||
|
||||
if __name__ == '__main__':
    """
    sys.argv[0] effectprocess.py
    sys.argv[1] instagram, kakaostory, facebook
    sys.argv[2] event_num
    sys.argv[3] url
    """

    if len(sys.argv) != 4:
        printl("x!@#!@#!@#e010!@#check argument")
        exit(1)

    platform_arg = sys.argv[1]
    event_arg = sys.argv[2]
    url_arg = sys.argv[3]

    # Defined up front so the cleanup helper is safe even when
    # get_effect_process() itself raises (previously the except blocks
    # touched an unbound name and died with NameError).
    effect_process = None

    def _close_driver():
        # Instagram runs without a Selenium driver; the others must
        # release theirs.  Best-effort: the process is exiting anyway.
        if platform_arg != 'instagram' and effect_process is not None:
            try:
                effect_process.driver.close()
            except Exception:
                pass

    try:
        effect_process = get_effect_process(platform_arg, event_arg, url_arg)
        effect_process.start()
    except effect.effecterror.EffectException as e:
        printl("x!@#" + str(event_arg) + "!@#" + str(url_arg) + "!@#" + str(e))
        _close_driver()
        exit(1)
    except Exception as e:
        printl("x!@#" + str(event_arg) + "!@#" + str(url_arg) + "!@#" + 'e012' + str(e))
        _close_driver()
        exit(1)

    printl("o!@#" + str(event_arg) + "!@#" + str(url_arg))
    _close_driver()
    exit(0)
||||
45
WebBasedCrawler/rankcheck.py
Normal file
45
WebBasedCrawler/rankcheck.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import base.baseclasses
|
||||
import time
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
|
||||
# Naver blog-tab search endpoint; the keyword is appended to the query string.
base_url = 'https://search.naver.com/search.naver?where=post&sm=tab_jum&ie=utf8&query='
# Keywords whose blog-post rankings get recorded.
keywords = ['vsl', '유산균']


if '__main__' == __name__:
    browser = base.baseclasses.Browser()
    driver = browser.new_firefox_browser()
    try:
        for keyword in keywords:
            driver.get(base_url + keyword)
            time.sleep(10)  # let the result page render before scraping
            rank = 1
            # One output file per keyword, timestamped per run.
            with open(keyword + time.strftime("%Y%m%d_%H%M%S") + ".txt", 'w') as f:
                try:
                    # Walk up to 100 result pages.
                    for page in range(1, 101):
                        ul = driver.find_element_by_css_selector("ul[class^='type']")
                        lis = ul.find_elements_by_css_selector("li[class='sh_blog_top']")
                        for li in lis:
                            # Two result-item layouts exist; fall back to the older one.
                            try:
                                a = li.find_element_by_xpath("div/a")
                            except Exception:
                                a = li.find_element_by_xpath("dl/dt/a")
                            href = a.get_attribute('href')
                            # Normalize the redirect-style URL to a canonical post URL.
                            href = href.replace("?Redirect=Log&logNo=", "/")
                            f.write("{0}: {1}\n".format(rank, href))
                            print("{0}: {1}".format(rank, href))
                            f.flush()
                            rank += 1
                        # The current page number is a <strong>; click the first
                        # <a> that follows it to advance one page.
                        div_paging = driver.find_element_by_css_selector("div[class='paging']")
                        page_elems = div_paging.find_elements_by_css_selector("*")
                        passed_current = False
                        for elem in page_elems:
                            if elem.tag_name == "strong":
                                passed_current = True
                            elif passed_current and elem.tag_name == "a":
                                elem.send_keys(Keys.NULL)
                                elem.send_keys(Keys.ENTER)
                                time.sleep(10)
                                break
                except Exception:
                    # No further pages, or the result layout changed:
                    # stop scraping this keyword and move on.  (Was a bare
                    # `except:`, which also caught KeyboardInterrupt.)
                    pass
    finally:
        # Always release the browser, even if scraping raised.
        driver.quit()
|
||||
40
WebBasedCrawler/rankcheckin.py
Normal file
40
WebBasedCrawler/rankcheckin.py
Normal file
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/python3
|
||||
import sys
|
||||
import os.path
|
||||
|
||||
if "__main__" == __name__:
|
||||
if not (len(sys.argv) == 3 or len(sys.argv) == 4):
|
||||
print("Usage : python {0} file1 file2 [ranknum]".format(sys.argv[0]))
|
||||
exit(1)
|
||||
|
||||
if not os.path.isfile(sys.argv[1]) or not os.path.isfile(sys.argv[2]):
|
||||
print("check files :\nfile1 : {0}\nfile2 : {1}".format(sys.argv[1], sys.argv[2]))
|
||||
exit(1)
|
||||
|
||||
rank1 = []
|
||||
rank2 = []
|
||||
|
||||
if len(sys.argv) == 4 and sys.argv[3].isnumeric():
|
||||
rank = int(sys.argv[3])
|
||||
else:
|
||||
rank = 1000
|
||||
|
||||
with open(sys.argv[1]) as f:
|
||||
for line in f:
|
||||
rank1.append(line[line.index('http'):].replace('\n', ''))
|
||||
|
||||
with open(sys.argv[2]) as f:
|
||||
for line in f:
|
||||
rank2.append(line[line.index('http'):].replace('\n', ''))
|
||||
|
||||
count = 0
|
||||
|
||||
if rank <= len(rank1) and rank <= len(rank2):
|
||||
for url in rank1[:rank]:
|
||||
if url in rank2[:rank]:
|
||||
count += 1
|
||||
else:
|
||||
for url in rank1:
|
||||
if url in rank2:
|
||||
count += 1
|
||||
print(count)
|
||||
19
column.txt
Normal file
19
column.txt
Normal file
@@ -0,0 +1,19 @@
|
||||
#데이타 베이스명,컬럼 명,Date 검색,Count
|
||||
platform_name,플랫폼 이름,x,x
|
||||
platform_form,플랫폼,x,x
|
||||
article_form,종류,x,x
|
||||
article_id,아이디,x,o
|
||||
article_nickname,닉네임,x,o
|
||||
article_date,날짜,o,o
|
||||
article_title,타이틀,x,x
|
||||
article_data,데이타,x,x
|
||||
platform_id,플랫폼 아이디,x,o
|
||||
platform_title,플랫폼 타이틀,x,x
|
||||
article_url,주소,x,
|
||||
article_parent,상위 댓글 작성자,x,o
|
||||
article_order,댓글 순서,x,x
|
||||
reply_url,다른 주소,x,
|
||||
article_hit,조회수,x,x
|
||||
keyword_id,검색어 번호,x
|
||||
article_profileurl,프로파일 주소,x
|
||||
article_profile,프로파일,x
|
||||
Reference in New Issue
Block a user