git-svn-id: svn://192.168.0.12/source@343 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
27
WebBasedCrawler/browser.txt
Normal file
27
WebBasedCrawler/browser.txt
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
# This file configures which browser you use when crawling
|
||||||
|
|
||||||
|
# Platforms are instagram, kakaostory, navercafe
|
||||||
|
# You can use these options: chrome, firefox, opera, ie
|
||||||
|
|
||||||
|
# If you have installed chromedriver and Chrome in the same folder
|
||||||
|
#platform=chrome
|
||||||
|
|
||||||
|
# If you use windows and have installed IEDriverServer.exe
|
||||||
|
#platform=ie
|
||||||
|
|
||||||
|
# If you have installed Firefox
|
||||||
|
#platform=firefox
|
||||||
|
|
||||||
|
# If you have installed OperaBrowser and operadriver
|
||||||
|
#platform=opera
|
||||||
|
|
||||||
|
# You can also specify a browser for each platform individually
|
||||||
|
# If no browser is set for a platform, the crawler falls back to the default entry
|
||||||
|
# If browser.txt file is empty or not configured,
|
||||||
|
# ie is default in Windows. firefox is default in Linux
|
||||||
|
|
||||||
|
default=chrome
|
||||||
|
kakaostory=chrome
|
||||||
|
#instagram=firefox
|
||||||
|
navercafe=firefox
|
||||||
|
#facebook=chrome
|
||||||
5
WebBasedCrawler/effect.ini
Normal file
5
WebBasedCrawler/effect.ini
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
[database]
|
||||||
|
user=root
|
||||||
|
pass=1234
|
||||||
|
host=192.168.0.82
|
||||||
|
name=bigbird
|
||||||
78
WebBasedCrawler/effectprocess.py
Normal file
78
WebBasedCrawler/effectprocess.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import effect.effectinstagram
|
||||||
|
import effect.effecterror
|
||||||
|
import effect.effectkakaostory
|
||||||
|
from base.baseclasses import printl
|
||||||
|
import sys
|
||||||
|
import base.baseclasses
|
||||||
|
|
||||||
|
# Browsers the driver factory knows how to launch.
browser_opt = ('chrome', 'ie', 'opera', 'firefox')
# Platforms a crawl job may target.
platform_opt = ('instagram', 'kakaostory', 'navercafe', 'facebook')


def get_browser_info(platform_, file_name="browser.txt"):
    """Return the configured browser name for *platform_*.

    Reads ``file_name`` (``key=value`` lines; ``#`` comments and blank
    lines are ignored) into a platform->browser mapping and returns the
    entry for *platform_*, falling back to the ``default`` entry.  The
    built-in default is ``ie`` on Windows and ``firefox`` elsewhere.

    A missing/unreadable file or a malformed line never raises; the
    defaults gathered so far are returned instead.  (The original code
    achieved this with a bare ``try/finally: return``, which also
    silently aborted the rest of the file on the first malformed line.)
    """
    # Platform-dependent fallback, used when browser.txt is absent or
    # does not configure the requested platform.
    if sys.platform == 'win32':
        options = {'default': 'ie'}
    else:
        options = {'default': 'firefox'}
    try:
        with open(file_name, 'r') as f:
            for raw in f:
                line = raw.strip()
                # Skip comments and blank lines.
                if not line or line.startswith('#'):
                    continue
                # partition() tolerates lines without '=' (browser stays
                # empty and is rejected below) and '=' inside the value.
                platform, _, browser = line.partition('=')
                platform = platform.strip()
                browser = browser.strip()
                # Accept only known platform/browser combinations; bad
                # entries are ignored instead of aborting the parse.
                if (platform == 'default' or platform in platform_opt) \
                        and browser in browser_opt:
                    options[platform] = browser
    except OSError:
        # Config file missing or unreadable: keep the defaults.
        pass
    return options.get(platform_, options['default'])
|
||||||
|
def get_effect_process(platform_, event_num, url):
    """Build and return the effect-process object for *platform_*.

    :param platform_: one of ``platform_opt`` (e.g. 'instagram',
        'kakaostory'); anything else yields ``None``.
    :param event_num: event identifier, converted with ``int()``.
    :param url: target URL passed through to the effect object.
    :returns: an effect object, or ``None`` for unsupported platforms.
    """
    if platform_ == 'instagram':
        # Instagram effect does not take a selenium driver.
        # NOTE(review): event_num is passed as both of the first two
        # arguments — looks suspicious, confirm against EffectInsta.
        return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url)

    if platform_ == 'kakaostory':
        # Open a browser only when the platform actually needs one.
        # (Previously the driver was created before this check, so an
        # unsupported platform opened a browser and leaked it.)
        browser_info = get_browser_info(platform_)
        browser = base.baseclasses.Browser()
        driver = browser.get_new_driver(browser_info)
        return effect.effectkakaostory.EffectKakaostory(int(event_num), int(event_num), url, driver)

    return None
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    """
    sys.argv[0] effectprocess.py
    sys.argv[1] instagram, kakaostory, facebook
    sys.argv[2] event_num
    sys.argv[3] url
    """
    if len(sys.argv) != 4:
        printl("x!@#!@#!@#e010!@#check argument")
        exit(1)

    # Bound before the try so the cleanup below cannot raise NameError
    # when get_effect_process itself throws.
    effect_process = None

    def _close_driver():
        # Non-instagram processes own a selenium driver that must be
        # closed.  Guard against the process never having been created
        # (constructor raised, or unsupported platform returned None) —
        # the original called effect_process.driver.close() unguarded.
        if sys.argv[1] != 'instagram' and effect_process is not None:
            effect_process.driver.close()

    try:
        effect_process = get_effect_process(sys.argv[1], sys.argv[2], sys.argv[3])
        effect_process.start()
    except effect.effecterror.EffectException as e:
        printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + str(e))
        _close_driver()
        exit(1)
    except Exception as e:
        printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + 'e012' + str(e))
        _close_driver()
        exit(1)

    printl("o!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]))
    _close_driver()
    exit(0)
|
||||||
45
WebBasedCrawler/rankcheck.py
Normal file
45
WebBasedCrawler/rankcheck.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
import base.baseclasses
|
||||||
|
import time
|
||||||
|
from selenium.webdriver.common.keys import Keys
|
||||||
|
|
||||||
|
# Naver blog-tab search endpoint; the query keyword is appended directly.
base_url = 'https://search.naver.com/search.naver?where=post&sm=tab_jum&ie=utf8&query='
# Keywords whose blog-search ranking is collected on each run.
keywords = ['vsl', '유산균']
|
||||||
|
|
||||||
|
|
||||||
|
if '__main__' == __name__:
    # Crawl Naver blog search results for each keyword and record the
    # rank + URL of every 'sh_blog_top' entry into a timestamped file.
    browser = base.baseclasses.Browser()
    driver = browser.new_firefox_browser()
    for keyword in keywords:
        driver.get(base_url + keyword)
        time.sleep(10)  # crude wait for the results page to render
        rank = 1
        # One output file per keyword per run, e.g. "vsl20200101_120000.txt".
        with open(keyword + time.strftime("%Y%m%d_%H%M%S") + ".txt", 'w') as f:
            try:
                # Walk up to 100 result pages.
                for i in range(1, 101):
                    ul = driver.find_element_by_css_selector("ul[class^='type']")
                    lis = ul.find_elements_by_css_selector("li[class='sh_blog_top']")
                    for li in lis:
                        # Result markup varies: try the div/a layout first,
                        # fall back to dl/dt/a.
                        try:
                            a = li.find_element_by_xpath("div/a")
                        except:
                            a = li.find_element_by_xpath("dl/dt/a")
                        href = a.get_attribute('href')
                        # Normalize the redirect-style blog URL to a
                        # canonical "/postId" form so runs are comparable.
                        href = href.replace("?Redirect=Log&logNo=", "/")
                        f.write("{0}: {1}\n".format(rank, href))
                        print("{0}: {1}".format(rank, href))
                        f.flush()  # keep partial results even on abort
                        rank += 1
                    # Pagination: the current page is a <strong>; the first
                    # <a> after it is the "next page" link.
                    div_paging = driver.find_element_by_css_selector("div[class='paging']")
                    pages = div_paging.find_elements_by_css_selector("*")
                    clickable = False
                    for j in pages:
                        if j.tag_name == "strong":
                            clickable = True
                        elif clickable and j.tag_name == "a":
                            j.send_keys(Keys.NULL)
                            j.send_keys(Keys.ENTER)
                            time.sleep(10)
                            break
            except:
                # Deliberate best-effort: any failure (no more pages,
                # missing element) just ends this keyword's crawl.
                pass
    # NOTE(review): source indentation was lost; driver.quit() is placed
    # after the keyword loop (one browser for the whole run) — confirm.
    driver.quit()
|
||||||
40
WebBasedCrawler/rankcheckin.py
Normal file
40
WebBasedCrawler/rankcheckin.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
#!/usr/bin/python3
"""Compare two rank files and print how many URLs they share.

Each input file holds lines of the form "<rank>: <url>" (as produced by
rankcheck.py).  With an optional third argument N, only the top-N
entries of each file are compared when both files have at least N lines.
"""
import sys
import os.path


def read_urls(path):
    """Return the URL column of a rank file, one entry per line.

    Lines that contain no 'http' are skipped instead of crashing
    (the original ``str.index`` raised ValueError on such lines).
    """
    urls = []
    with open(path) as f:
        for line in f:
            pos = line.find('http')
            if pos == -1:
                continue
            urls.append(line[pos:].rstrip('\n'))
    return urls


def count_common(rank1, rank2, rank):
    """Count entries of *rank1* that also appear in *rank2*.

    When both lists have at least *rank* entries, only the top-*rank*
    slices are compared; otherwise the whole lists are used (matching
    the original behavior).
    """
    if rank <= len(rank1) and rank <= len(rank2):
        rank1 = rank1[:rank]
        rank2 = rank2[:rank]
    # Set membership keeps the comparison O(n) instead of O(n^2).
    seen = set(rank2)
    return sum(1 for url in rank1 if url in seen)


if "__main__" == __name__:
    if len(sys.argv) not in (3, 4):
        print("Usage : python {0} file1 file2 [ranknum]".format(sys.argv[0]))
        exit(1)

    if not os.path.isfile(sys.argv[1]) or not os.path.isfile(sys.argv[2]):
        print("check files :\nfile1 : {0}\nfile2 : {1}".format(sys.argv[1], sys.argv[2]))
        exit(1)

    # Non-numeric or missing third argument falls back to 1000, which in
    # practice means "compare the whole files".
    if len(sys.argv) == 4 and sys.argv[3].isnumeric():
        rank = int(sys.argv[3])
    else:
        rank = 1000

    print(count_common(read_urls(sys.argv[1]), read_urls(sys.argv[2]), rank))
|
||||||
19
column.txt
Normal file
19
column.txt
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
#데이타 베이스명,컬럼 명,Date 검색,Count
|
||||||
|
platform_name,플랫폼 이름,x,x
|
||||||
|
platform_form,플랫폼,x,x
|
||||||
|
article_form,종류,x,x
|
||||||
|
article_id,아이디,x,o
|
||||||
|
article_nickname,닉네임,x,o
|
||||||
|
article_date,날짜,o,o
|
||||||
|
article_title,타이틀,x,x
|
||||||
|
article_data,데이타,x,x
|
||||||
|
platform_id,플랫폼 아이디,x,o
|
||||||
|
platform_title,플랫폼 타이틀,x,x
|
||||||
|
article_url,주소,x,
|
||||||
|
article_parent,상위 댓글 작성자,x,o
|
||||||
|
article_order,댓글 순서,x,x
|
||||||
|
reply_url,다른 주소,x,
|
||||||
|
article_hit,조회수,x,x
|
||||||
|
keyword_id,검색어 번호,x
|
||||||
|
article_profileurl,프로파일 주소,x
|
||||||
|
article_profile,프로파일,x
|
||||||
Reference in New Issue
Block a user