git-svn-id: svn://192.168.0.12/source@343 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
27
WebBasedCrawler/browser.txt
Normal file
27
WebBasedCrawler/browser.txt
Normal file
@@ -0,0 +1,27 @@
|
||||
# This file configures which browser you use when crawling
|
||||
|
||||
# Platforms are instagram, kakaostory, navercafe
|
||||
# You can use these options: chrome, firefox, opera, ie
|
||||
|
||||
# If you have installed chromedriver and Chrome in the same folder
|
||||
#platform=chrome
|
||||
|
||||
# If you use windows and have installed IEDriverServer.exe
|
||||
#platform=ie
|
||||
|
||||
# If you have installed Firefox
|
||||
#platform=firefox
|
||||
|
||||
# If you have installed OperaBrowser and operadriver
|
||||
#platform=opera
|
||||
|
||||
# You can also specify a browser option for each platform
|
||||
# If the browser option is empty for a platform, the crawler will use the default
|
||||
# If browser.txt file is empty or not configured,
|
||||
# ie is default in Windows. firefox is default in Linux
|
||||
|
||||
default=chrome
|
||||
kakaostory=chrome
|
||||
#instagram=firefox
|
||||
navercafe=firefox
|
||||
#facebook=chrome
|
||||
5
WebBasedCrawler/effect.ini
Normal file
5
WebBasedCrawler/effect.ini
Normal file
@@ -0,0 +1,5 @@
|
||||
[database]
|
||||
user=root
|
||||
pass=1234
|
||||
host=192.168.0.82
|
||||
name=bigbird
|
||||
78
WebBasedCrawler/effectprocess.py
Normal file
78
WebBasedCrawler/effectprocess.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import effect.effectinstagram
|
||||
import effect.effecterror
|
||||
import effect.effectkakaostory
|
||||
from base.baseclasses import printl
|
||||
import sys
|
||||
import base.baseclasses
|
||||
|
||||
# Browser names and platform keys accepted in browser.txt.
browser_opt = ('chrome', "ie", "opera", "firefox")
platform_opt = ('instagram', 'kakaostory', 'navercafe', "facebook")


def get_browser_info(platform_, file_name="browser.txt"):
    """Return the configured browser name for *platform_*.

    Reads ``file_name`` (``key=value`` lines, ``#`` comments, blank lines
    ignored) and builds a platform -> browser map.  Falls back to the
    OS default ('ie' on Windows, 'firefox' elsewhere) when the file is
    missing or unreadable, or when it has no entry for *platform_*.

    :param platform_: platform key, e.g. 'kakaostory' or 'navercafe'
    :param file_name: path of the browser configuration file
    :return: one of 'chrome', 'ie', 'opera', 'firefox'
    """
    # OS-dependent fallback, used when nothing else matches.
    if sys.platform == 'win32':
        options = {'default': 'ie'}
    else:
        options = {'default': 'firefox'}
    try:
        with open(file_name, 'r') as f:
            for line in f:
                stripped = line.strip()
                # Skip comments and blank lines.
                if stripped.startswith("#") or not stripped:
                    continue
                # A malformed line (zero or several '=') is ignored rather
                # than aborting the whole configuration read.  The original
                # code let the ValueError escape into a `finally: return`,
                # which silently truncated parsing at the bad line.
                try:
                    platform, browser = stripped.split("=")
                except ValueError:
                    continue
                platform = platform.strip()
                browser = browser.strip()
                # Accept only known platform keys and browser names.
                if (platform in options or platform in platform_opt) and browser in browser_opt:
                    options[platform] = browser
    except OSError:
        # Missing/unreadable config file: fall through to the defaults.
        # (Previously a bare try/finally swallowed *every* exception here,
        # including KeyboardInterrupt.)
        pass
    return options.get(platform_, options['default'])
|
||||
|
||||
|
||||
def get_effect_process(platform_, event_num, url):
    """Build and return the effect worker object for *platform_*.

    Instagram runs without a Selenium driver; the other platforms get a
    fresh browser driver chosen via get_browser_info().  Platforms with
    no worker implementation yield ``None``.

    :param platform_: 'instagram', 'kakaostory', ... (see platform_opt)
    :param event_num: event identifier, converted to int
    :param url: target URL for the worker
    """
    num = int(event_num)

    # Instagram is handled API-side; no browser needed.
    if platform_ == 'instagram':
        return effect.effectinstagram.EffectInsta(num, num, url)

    # Every other platform is driven through a real browser session.
    browser_info = get_browser_info(platform_)
    driver = base.baseclasses.Browser().get_new_driver(browser_info)

    if platform_ != 'kakaostory':
        return None
    return effect.effectkakaostory.EffectKakaostory(num, num, url, driver)
|
||||
|
||||
if __name__ == '__main__':
    """
    sys.argv[0] effectprocess.py
    sys.argv[1] instagram, kakaostory, facebook
    sys.argv[2] event_num
    sys.argv[3] url
    """

    if len(sys.argv) != 4:
        printl("x!@#!@#!@#e010!@#check argument")
        exit(1)

    platform_arg = sys.argv[1]
    event_arg = sys.argv[2]
    url_arg = sys.argv[3]

    # Defined up front so the cleanup helper is safe even when
    # get_effect_process() itself raises (previously the except blocks
    # touched an unbound name and died with NameError).
    effect_process = None

    def _close_driver():
        # Instagram runs without a Selenium driver; the others must
        # release theirs.  Best-effort: the process is exiting anyway.
        if platform_arg != 'instagram' and effect_process is not None:
            try:
                effect_process.driver.close()
            except Exception:
                pass

    try:
        effect_process = get_effect_process(platform_arg, event_arg, url_arg)
        effect_process.start()
    except effect.effecterror.EffectException as e:
        printl("x!@#" + str(event_arg) + "!@#" + str(url_arg) + "!@#" + str(e))
        _close_driver()
        exit(1)
    except Exception as e:
        printl("x!@#" + str(event_arg) + "!@#" + str(url_arg) + "!@#" + 'e012' + str(e))
        _close_driver()
        exit(1)

    printl("o!@#" + str(event_arg) + "!@#" + str(url_arg))
    _close_driver()
    exit(0)
||||
45
WebBasedCrawler/rankcheck.py
Normal file
45
WebBasedCrawler/rankcheck.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import base.baseclasses
|
||||
import time
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
|
||||
# Naver blog-tab search endpoint; the keyword is appended to the query string.
base_url = 'https://search.naver.com/search.naver?where=post&sm=tab_jum&ie=utf8&query='
# Keywords whose blog-post rankings get recorded.
keywords = ['vsl', '유산균']


if '__main__' == __name__:
    browser = base.baseclasses.Browser()
    driver = browser.new_firefox_browser()
    try:
        for keyword in keywords:
            driver.get(base_url + keyword)
            time.sleep(10)  # let the result page render before scraping
            rank = 1
            # One output file per keyword, timestamped per run.
            with open(keyword + time.strftime("%Y%m%d_%H%M%S") + ".txt", 'w') as f:
                try:
                    # Walk up to 100 result pages.
                    for page in range(1, 101):
                        ul = driver.find_element_by_css_selector("ul[class^='type']")
                        lis = ul.find_elements_by_css_selector("li[class='sh_blog_top']")
                        for li in lis:
                            # Two result-item layouts exist; fall back to the older one.
                            try:
                                a = li.find_element_by_xpath("div/a")
                            except Exception:
                                a = li.find_element_by_xpath("dl/dt/a")
                            href = a.get_attribute('href')
                            # Normalize the redirect-style URL to a canonical post URL.
                            href = href.replace("?Redirect=Log&logNo=", "/")
                            f.write("{0}: {1}\n".format(rank, href))
                            print("{0}: {1}".format(rank, href))
                            f.flush()
                            rank += 1
                        # The current page number is a <strong>; click the first
                        # <a> that follows it to advance one page.
                        div_paging = driver.find_element_by_css_selector("div[class='paging']")
                        page_elems = div_paging.find_elements_by_css_selector("*")
                        passed_current = False
                        for elem in page_elems:
                            if elem.tag_name == "strong":
                                passed_current = True
                            elif passed_current and elem.tag_name == "a":
                                elem.send_keys(Keys.NULL)
                                elem.send_keys(Keys.ENTER)
                                time.sleep(10)
                                break
                except Exception:
                    # No further pages, or the result layout changed:
                    # stop scraping this keyword and move on.  (Was a bare
                    # `except:`, which also caught KeyboardInterrupt.)
                    pass
    finally:
        # Always release the browser, even if scraping raised.
        driver.quit()
|
||||
40
WebBasedCrawler/rankcheckin.py
Normal file
40
WebBasedCrawler/rankcheckin.py
Normal file
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/python3
|
||||
import sys
|
||||
import os.path
|
||||
|
||||
if "__main__" == __name__:
|
||||
if not (len(sys.argv) == 3 or len(sys.argv) == 4):
|
||||
print("Usage : python {0} file1 file2 [ranknum]".format(sys.argv[0]))
|
||||
exit(1)
|
||||
|
||||
if not os.path.isfile(sys.argv[1]) or not os.path.isfile(sys.argv[2]):
|
||||
print("check files :\nfile1 : {0}\nfile2 : {1}".format(sys.argv[1], sys.argv[2]))
|
||||
exit(1)
|
||||
|
||||
rank1 = []
|
||||
rank2 = []
|
||||
|
||||
if len(sys.argv) == 4 and sys.argv[3].isnumeric():
|
||||
rank = int(sys.argv[3])
|
||||
else:
|
||||
rank = 1000
|
||||
|
||||
with open(sys.argv[1]) as f:
|
||||
for line in f:
|
||||
rank1.append(line[line.index('http'):].replace('\n', ''))
|
||||
|
||||
with open(sys.argv[2]) as f:
|
||||
for line in f:
|
||||
rank2.append(line[line.index('http'):].replace('\n', ''))
|
||||
|
||||
count = 0
|
||||
|
||||
if rank <= len(rank1) and rank <= len(rank2):
|
||||
for url in rank1[:rank]:
|
||||
if url in rank2[:rank]:
|
||||
count += 1
|
||||
else:
|
||||
for url in rank1:
|
||||
if url in rank2:
|
||||
count += 1
|
||||
print(count)
|
||||
19
column.txt
Normal file
19
column.txt
Normal file
@@ -0,0 +1,19 @@
|
||||
#데이타 베이스명,컬럼 명,Date 검색,Count
|
||||
platform_name,플랫폼 이름,x,x
|
||||
platform_form,플랫폼,x,x
|
||||
article_form,종류,x,x
|
||||
article_id,아이디,x,o
|
||||
article_nickname,닉네임,x,o
|
||||
article_date,날짜,o,o
|
||||
article_title,타이틀,x,x
|
||||
article_data,데이타,x,x
|
||||
platform_id,플랫폼 아이디,x,o
|
||||
platform_title,플랫폼 타이틀,x,x
|
||||
article_url,주소,x,
|
||||
article_parent,상위 댓글 작성자,x,o
|
||||
article_order,댓글 순서,x,x
|
||||
reply_url,다른 주소,x,
|
||||
article_hit,조회수,x,x
|
||||
keyword_id,검색어 번호,x
|
||||
article_profileurl,프로파일 주소,x
|
||||
article_profile,프로파일,x
|
||||
Reference in New Issue
Block a user