git-svn-id: svn://192.168.0.12/source@343 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
27
WebBasedCrawler/browser.txt
Normal file
27
WebBasedCrawler/browser.txt
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
# This file configures which browser you use when crawling
|
||||||
|
|
||||||
|
# Platforms are instagram, kakaostory, navercafe
|
||||||
|
# You can use these options: chrome, firefox, opera, ie
|
||||||
|
|
||||||
|
# If you have installed chromedriver and Chrome in the same folder
|
||||||
|
#platform=chrome
|
||||||
|
|
||||||
|
# If you use windows and have installed IEDriverServer.exe
|
||||||
|
#platform=ie
|
||||||
|
|
||||||
|
# If you have installed Firefox
|
||||||
|
#platform=firefox
|
||||||
|
|
||||||
|
# If you have installed OperaBrowser and operadriver
|
||||||
|
#platform=opera
|
||||||
|
|
||||||
|
# You can also specify a browser for each platform individually
|
||||||
|
# If no browser is set for a platform, the crawler falls back to the default entry
|
||||||
|
# If browser.txt file is empty or not configured,
|
||||||
|
# ie is default in Windows. firefox is default in Linux
|
||||||
|
|
||||||
|
default=chrome
|
||||||
|
kakaostory=chrome
|
||||||
|
#instagram=firefox
|
||||||
|
navercafe=firefox
|
||||||
|
#facebook=chrome
|
||||||
5
WebBasedCrawler/effect.ini
Normal file
5
WebBasedCrawler/effect.ini
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
[database]
|
||||||
|
user=root
|
||||||
|
pass=1234
|
||||||
|
host=192.168.0.82
|
||||||
|
name=bigbird
|
||||||
78
WebBasedCrawler/effectprocess.py
Normal file
78
WebBasedCrawler/effectprocess.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import effect.effectinstagram
|
||||||
|
import effect.effecterror
|
||||||
|
import effect.effectkakaostory
|
||||||
|
from base.baseclasses import printl
|
||||||
|
import sys
|
||||||
|
import base.baseclasses
|
||||||
|
|
||||||
|
# Browsers the driver factory knows how to launch.
browser_opt = ('chrome', 'ie', 'opera', 'firefox')
# Platforms a crawl job may target.
platform_opt = ('instagram', 'kakaostory', 'navercafe', 'facebook')


def get_browser_info(platform_, file_name="browser.txt"):
    """Return the configured browser name for *platform_*.

    Reads ``file_name`` (``key=value`` lines; ``#`` comments and blank
    lines are ignored) into a platform->browser mapping and returns the
    entry for *platform_*, falling back to the ``default`` entry.  The
    built-in default is ``ie`` on Windows and ``firefox`` elsewhere.

    A missing/unreadable file or a malformed line never raises; the
    defaults gathered so far are returned instead.  (The original code
    achieved this with a bare ``try/finally: return``, which also
    silently aborted the rest of the file on the first malformed line.)
    """
    # Platform-dependent fallback, used when browser.txt is absent or
    # does not configure the requested platform.
    if sys.platform == 'win32':
        options = {'default': 'ie'}
    else:
        options = {'default': 'firefox'}
    try:
        with open(file_name, 'r') as f:
            for raw in f:
                line = raw.strip()
                # Skip comments and blank lines.
                if not line or line.startswith('#'):
                    continue
                # partition() tolerates lines without '=' (browser stays
                # empty and is rejected below) and '=' inside the value.
                platform, _, browser = line.partition('=')
                platform = platform.strip()
                browser = browser.strip()
                # Accept only known platform/browser combinations; bad
                # entries are ignored instead of aborting the parse.
                if (platform == 'default' or platform in platform_opt) \
                        and browser in browser_opt:
                    options[platform] = browser
    except OSError:
        # Config file missing or unreadable: keep the defaults.
        pass
    return options.get(platform_, options['default'])
|
||||||
|
def get_effect_process(platform_, event_num, url):
    """Build and return the effect-process object for *platform_*.

    :param platform_: one of ``platform_opt`` (e.g. 'instagram',
        'kakaostory'); anything else yields ``None``.
    :param event_num: event identifier, converted with ``int()``.
    :param url: target URL passed through to the effect object.
    :returns: an effect object, or ``None`` for unsupported platforms.
    """
    if platform_ == 'instagram':
        # Instagram effect does not take a selenium driver.
        # NOTE(review): event_num is passed as both of the first two
        # arguments — looks suspicious, confirm against EffectInsta.
        return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url)

    if platform_ == 'kakaostory':
        # Open a browser only when the platform actually needs one.
        # (Previously the driver was created before this check, so an
        # unsupported platform opened a browser and leaked it.)
        browser_info = get_browser_info(platform_)
        browser = base.baseclasses.Browser()
        driver = browser.get_new_driver(browser_info)
        return effect.effectkakaostory.EffectKakaostory(int(event_num), int(event_num), url, driver)

    return None
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    """
    sys.argv[0] effectprocess.py
    sys.argv[1] instagram, kakaostory, facebook
    sys.argv[2] event_num
    sys.argv[3] url
    """
    if len(sys.argv) != 4:
        printl("x!@#!@#!@#e010!@#check argument")
        exit(1)

    # Bound before the try so the cleanup below cannot raise NameError
    # when get_effect_process itself throws.
    effect_process = None

    def _close_driver():
        # Non-instagram processes own a selenium driver that must be
        # closed.  Guard against the process never having been created
        # (constructor raised, or unsupported platform returned None) —
        # the original called effect_process.driver.close() unguarded.
        if sys.argv[1] != 'instagram' and effect_process is not None:
            effect_process.driver.close()

    try:
        effect_process = get_effect_process(sys.argv[1], sys.argv[2], sys.argv[3])
        effect_process.start()
    except effect.effecterror.EffectException as e:
        printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + str(e))
        _close_driver()
        exit(1)
    except Exception as e:
        printl("x!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]) + "!@#" + 'e012' + str(e))
        _close_driver()
        exit(1)

    printl("o!@#" + str(sys.argv[2]) + "!@#" + str(sys.argv[3]))
    _close_driver()
    exit(0)
|
||||||
45
WebBasedCrawler/rankcheck.py
Normal file
45
WebBasedCrawler/rankcheck.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
import base.baseclasses
|
||||||
|
import time
|
||||||
|
from selenium.webdriver.common.keys import Keys
|
||||||
|
|
||||||
|
# Naver blog-tab search endpoint; the query keyword is appended directly.
base_url = 'https://search.naver.com/search.naver?where=post&sm=tab_jum&ie=utf8&query='
# Keywords whose blog-search ranking is collected on each run.
keywords = ['vsl', '유산균']
|
||||||
|
|
||||||
|
|
||||||
|
if '__main__' == __name__:
    # Crawl Naver blog search results for each keyword and record the
    # rank + URL of every 'sh_blog_top' entry into a timestamped file.
    browser = base.baseclasses.Browser()
    driver = browser.new_firefox_browser()
    for keyword in keywords:
        driver.get(base_url + keyword)
        time.sleep(10)  # crude wait for the results page to render
        rank = 1
        # One output file per keyword per run, e.g. "vsl20200101_120000.txt".
        with open(keyword + time.strftime("%Y%m%d_%H%M%S") + ".txt", 'w') as f:
            try:
                # Walk up to 100 result pages.
                for i in range(1, 101):
                    ul = driver.find_element_by_css_selector("ul[class^='type']")
                    lis = ul.find_elements_by_css_selector("li[class='sh_blog_top']")
                    for li in lis:
                        # Result markup varies: try the div/a layout first,
                        # fall back to dl/dt/a.
                        try:
                            a = li.find_element_by_xpath("div/a")
                        except:
                            a = li.find_element_by_xpath("dl/dt/a")
                        href = a.get_attribute('href')
                        # Normalize the redirect-style blog URL to a
                        # canonical "/postId" form so runs are comparable.
                        href = href.replace("?Redirect=Log&logNo=", "/")
                        f.write("{0}: {1}\n".format(rank, href))
                        print("{0}: {1}".format(rank, href))
                        f.flush()  # keep partial results even on abort
                        rank += 1
                    # Pagination: the current page is a <strong>; the first
                    # <a> after it is the "next page" link.
                    div_paging = driver.find_element_by_css_selector("div[class='paging']")
                    pages = div_paging.find_elements_by_css_selector("*")
                    clickable = False
                    for j in pages:
                        if j.tag_name == "strong":
                            clickable = True
                        elif clickable and j.tag_name == "a":
                            j.send_keys(Keys.NULL)
                            j.send_keys(Keys.ENTER)
                            time.sleep(10)
                            break
            except:
                # Deliberate best-effort: any failure (no more pages,
                # missing element) just ends this keyword's crawl.
                pass
    # NOTE(review): source indentation was lost; driver.quit() is placed
    # after the keyword loop (one browser for the whole run) — confirm.
    driver.quit()
|
||||||
40
WebBasedCrawler/rankcheckin.py
Normal file
40
WebBasedCrawler/rankcheckin.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
#!/usr/bin/python3
"""Compare two rank files and print how many URLs they share.

Each input file holds lines of the form "<rank>: <url>" (as produced by
rankcheck.py).  With an optional third argument N, only the top-N
entries of each file are compared when both files have at least N lines.
"""
import sys
import os.path


def read_urls(path):
    """Return the URL column of a rank file, one entry per line.

    Lines that contain no 'http' are skipped instead of crashing
    (the original ``str.index`` raised ValueError on such lines).
    """
    urls = []
    with open(path) as f:
        for line in f:
            pos = line.find('http')
            if pos == -1:
                continue
            urls.append(line[pos:].rstrip('\n'))
    return urls


def count_common(rank1, rank2, rank):
    """Count entries of *rank1* that also appear in *rank2*.

    When both lists have at least *rank* entries, only the top-*rank*
    slices are compared; otherwise the whole lists are used (matching
    the original behavior).
    """
    if rank <= len(rank1) and rank <= len(rank2):
        rank1 = rank1[:rank]
        rank2 = rank2[:rank]
    # Set membership keeps the comparison O(n) instead of O(n^2).
    seen = set(rank2)
    return sum(1 for url in rank1 if url in seen)


if "__main__" == __name__:
    if len(sys.argv) not in (3, 4):
        print("Usage : python {0} file1 file2 [ranknum]".format(sys.argv[0]))
        exit(1)

    if not os.path.isfile(sys.argv[1]) or not os.path.isfile(sys.argv[2]):
        print("check files :\nfile1 : {0}\nfile2 : {1}".format(sys.argv[1], sys.argv[2]))
        exit(1)

    # Non-numeric or missing third argument falls back to 1000, which in
    # practice means "compare the whole files".
    if len(sys.argv) == 4 and sys.argv[3].isnumeric():
        rank = int(sys.argv[3])
    else:
        rank = 1000

    print(count_common(read_urls(sys.argv[1]), read_urls(sys.argv[2]), rank))
|
||||||
19
column.txt
Normal file
19
column.txt
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
#데이타 베이스명,컬럼 명,Date 검색,Count
|
||||||
|
platform_name,플랫폼 이름,x,x
|
||||||
|
platform_form,플랫폼,x,x
|
||||||
|
article_form,종류,x,x
|
||||||
|
article_id,아이디,x,o
|
||||||
|
article_nickname,닉네임,x,o
|
||||||
|
article_date,날짜,o,o
|
||||||
|
article_title,타이틀,x,x
|
||||||
|
article_data,데이타,x,x
|
||||||
|
platform_id,플랫폼 아이디,x,o
|
||||||
|
platform_title,플랫폼 타이틀,x,x
|
||||||
|
article_url,주소,x,
|
||||||
|
article_parent,상위 댓글 작성자,x,o
|
||||||
|
article_order,댓글 순서,x,x
|
||||||
|
reply_url,다른 주소,x,
|
||||||
|
article_hit,조회수,x,x
|
||||||
|
keyword_id,검색어 번호,x
|
||||||
|
article_profileurl,프로파일 주소,x
|
||||||
|
article_profile,프로파일,x
|
||||||
Reference in New Issue
Block a user