git-svn-id: svn://192.168.0.12/source@343 8346c931-da38-4b9b-9d4c-e48b93cbd075

This commit is contained in:
admin
2017-01-19 10:30:58 +00:00
parent 53d5da70de
commit 8ec8a773ad
6 changed files with 214 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
# This file configures which browser you use when crawling
# Platforms are instagram, kakaostory, navercafe, facebook
# Each setting has the form <platform>=<browser>; the browser is one of: chrome, firefox, opera, ie
# If you have installed chromedriver and Chrome in the same folder
#platform=chrome
# If you use windows and have installed IEDriverServer.exe
#platform=ie
# If you have installed Firefox
#platform=firefox
# If you have installed OperaBrowser and operadriver
#platform=opera
# You can also specify a browser for each platform
# If no browser is set for a platform, the crawler falls back to the default entry
# If the browser.txt file is missing, empty, or does not set a default,
# ie is the default on Windows and firefox is the default elsewhere
default=chrome
kakaostory=chrome
#instagram=firefox
navercafe=firefox
#facebook=chrome

View File

@@ -0,0 +1,5 @@
[database]
user=root
pass=1234
host=192.168.0.82
name=bigbird

View File

@@ -0,0 +1,78 @@
import effect.effectinstagram
import effect.effecterror
import effect.effectkakaostory
from base.baseclasses import printl
import sys
import base.baseclasses
# Browser names accepted as a setting value in the browser config file.
browser_opt = ('chrome', "ie", "opera", "firefox")
# Platform names accepted as a setting key (in addition to 'default').
platform_opt = ('instagram', 'kakaostory', 'navercafe', "facebook")


def get_browser_info(platform_, file_name="browser.txt"):
    """Return the browser name configured for *platform_*.

    The config file holds one ``key=browser`` setting per line.  Blank lines
    and lines starting with ``#`` are ignored.  A setting is applied only when
    its key is ``default`` or one of ``platform_opt`` and its value is one of
    ``browser_opt``; anything else is skipped without affecting later lines.

    Args:
        platform_: Platform name to look up.
        file_name: Path of the config file to read.

    Returns:
        The browser set for *platform_*, otherwise the ``default`` setting.
        The built-in default is ``'ie'`` when ``sys.platform`` is ``'win32'``
        and ``'firefox'`` otherwise; it is also what is returned when the file
        cannot be read.
    """
    options = {'default': 'ie' if sys.platform == 'win32' else 'firefox'}
    try:
        with open(file_name, 'r') as f:
            for line in f:
                entry = line.strip()
                if not entry or entry.startswith("#"):
                    continue
                # partition() never raises, so a malformed line (no '=' or
                # several '=') is rejected on its own instead of aborting the
                # whole parse.
                platform, separator, browser = entry.partition("=")
                if not separator:
                    continue
                platform = platform.strip()
                browser = browser.strip()
                if platform not in options and platform not in platform_opt:
                    continue
                if browser not in browser_opt:
                    continue
                options[platform] = browser
    except (OSError, UnicodeError):
        # Best effort: an unreadable config keeps whatever was parsed so far.
        pass
    return options.get(platform_, options['default'])
def get_effect_process(platform_, event_num, url):
    """Build the effect process object for *platform_*.

    Args:
        platform_: Platform name; 'instagram' and 'kakaostory' are handled.
        event_num: Event number; converted with ``int()`` and passed twice to
            the effect constructor.
        url: Target URL handed to the effect constructor.

    Returns:
        An ``EffectInsta`` for 'instagram', an ``EffectKakaostory`` (with a
        freshly created browser driver) for 'kakaostory', or ``None`` for any
        other platform.

    Raises:
        ValueError: If *event_num* is not an integer string (supported
            platforms only).
    """
    if platform_ == 'instagram':
        return effect.effectinstagram.EffectInsta(int(event_num), int(event_num), url)
    if platform_ != 'kakaostory':
        # Decide before launching a browser: a driver created here would have
        # no owner to close it.
        return None
    # Validate the event number first so a bad argument cannot leave a
    # browser running.
    event_id = int(event_num)
    browser_info = get_browser_info(platform_)
    browser = base.baseclasses.Browser()
    driver = browser.get_new_driver(browser_info)
    return effect.effectkakaostory.EffectKakaostory(event_id, event_id, url, driver)
def close_driver(effect_process, platform_):
    """Close the browser driver attached to *effect_process*, if there is one.

    Nothing is done for the 'instagram' platform, when *effect_process* is
    ``None`` (it was never created), or when it has no ``driver`` attribute.
    """
    if platform_ == 'instagram' or effect_process is None:
        return
    driver = getattr(effect_process, 'driver', None)
    if driver is not None:
        driver.close()


if __name__ == '__main__':
    # Command line:
    #   sys.argv[0] effectprocess.py
    #   sys.argv[1] instagram, kakaostory, facebook
    #   sys.argv[2] event_num
    #   sys.argv[3] url
    #
    # Result lines are printed through printl as '!@#'-separated fields:
    # 'o' for success, 'x' for failure.
    if len(sys.argv) != 4:
        printl("x!@#!@#!@#e010!@#check argument")
        sys.exit(1)

    platform_name, event_num, url = sys.argv[1], sys.argv[2], sys.argv[3]
    # Bound before the try so the cleanup below is safe even when creating
    # the process itself fails.
    effect_process = None
    exit_code = 0
    try:
        effect_process = get_effect_process(platform_name, event_num, url)
        if effect_process is None:
            raise ValueError("unsupported platform: " + platform_name)
        effect_process.start()
    except effect.effecterror.EffectException as e:
        printl("x!@#" + str(event_num) + "!@#" + str(url) + "!@#" + str(e))
        exit_code = 1
    except Exception as e:
        printl("x!@#" + str(event_num) + "!@#" + str(url) + "!@#" + 'e012' + str(e))
        exit_code = 1
    else:
        printl("o!@#" + str(event_num) + "!@#" + str(url))
    finally:
        # Single cleanup point for the success and both failure paths.
        close_driver(effect_process, platform_name)
    sys.exit(exit_code)

View File

@@ -0,0 +1,45 @@
import base.baseclasses
import time
from selenium.webdriver.common.keys import Keys
# Naver blog-post search URL; the keyword is appended as the query value.
base_url = 'https://search.naver.com/search.naver?where=post&sm=tab_jum&ie=utf8&query='
keywords = ['vsl', '유산균']


def _go_to_next_page(driver):
    """Activate the first link that follows the <strong> element in the pager.

    NOTE(review): the <strong> element presumably marks the current page, so
    the link after it would be the next page - confirm against the live markup.

    Returns:
        True if a link was activated (after waiting for the page to load),
        False if no link follows the <strong> element.
    """
    div_paging = driver.find_element_by_css_selector("div[class='paging']")
    after_current = False
    for node in div_paging.find_elements_by_css_selector("*"):
        if node.tag_name == "strong":
            after_current = True
        elif after_current and node.tag_name == "a":
            # Kept from the original: an empty key press precedes ENTER
            # (presumably to focus the link first - TODO confirm).
            node.send_keys(Keys.NULL)
            node.send_keys(Keys.ENTER)
            time.sleep(10)
            return True
    return False


if '__main__' == __name__:
    from selenium.common.exceptions import NoSuchElementException

    browser = base.baseclasses.Browser()
    driver = browser.new_firefox_browser()
    try:
        for keyword in keywords:
            driver.get(base_url + keyword)
            time.sleep(10)
            rank = 1
            # One timestamped output file per keyword, one "rank: url" line
            # per search result.
            with open(keyword + time.strftime("%Y%m%d_%H%M%S") + ".txt", 'w') as f:
                try:
                    # Visit at most 100 result pages per keyword.
                    for _ in range(1, 101):
                        ul = driver.find_element_by_css_selector("ul[class^='type']")
                        lis = ul.find_elements_by_css_selector("li[class='sh_blog_top']")
                        for li in lis:
                            try:
                                a = li.find_element_by_xpath("div/a")
                            except NoSuchElementException:
                                # Fall back to the alternative location of
                                # the result link.
                                a = li.find_element_by_xpath("dl/dt/a")
                            href = a.get_attribute('href')
                            href = href.replace("?Redirect=Log&logNo=", "/")
                            f.write("{0}: {1}\n".format(rank, href))
                            print("{0}: {1}".format(rank, href))
                            f.flush()
                            rank += 1
                        if not _go_to_next_page(driver):
                            # No further page: stop instead of re-recording
                            # the same results under new ranks.
                            break
                except Exception as exc:
                    # Best effort per keyword: report the problem and carry
                    # on with the next keyword (Ctrl-C still propagates).
                    print("stopped crawling '{0}': {1}".format(keyword, exc))
    finally:
        # Always shut the browser down, even if a keyword failed hard.
        driver.quit()

View File

@@ -0,0 +1,40 @@
#!/usr/bin/python3
import sys
import os.path
def read_urls(path):
    """Return the URL part of every line in *path* that contains 'http'.

    Each kept entry is the line from its first 'http' onwards with newline
    characters removed.  Lines without 'http' (e.g. blank lines) are skipped.
    """
    urls = []
    with open(path) as f:
        for line in f:
            start = line.find('http')
            if start < 0:
                continue
            urls.append(line[start:].replace('\n', ''))
    return urls


def count_common(rank1, rank2, rank):
    """Count entries in the first *rank* items of *rank1* that also occur in
    the first *rank* items of *rank2*.

    Slicing already copes with lists shorter than *rank*, so both lists are
    always limited the same way.
    """
    top_of_second = set(rank2[:rank])
    return sum(1 for url in rank1[:rank] if url in top_of_second)


if "__main__" == __name__:
    if len(sys.argv) not in (3, 4):
        print("Usage : python {0} file1 file2 [ranknum]".format(sys.argv[0]))
        sys.exit(1)
    if not os.path.isfile(sys.argv[1]) or not os.path.isfile(sys.argv[2]):
        print("check files :\nfile1 : {0}\nfile2 : {1}".format(sys.argv[1], sys.argv[2]))
        sys.exit(1)
    # isdecimal() only accepts strings that int() can parse; isnumeric() also
    # accepts characters such as superscripts that make int() raise.
    if len(sys.argv) == 4 and sys.argv[3].isdecimal():
        rank = int(sys.argv[3])
    else:
        if len(sys.argv) == 4:
            # Keep stdout reserved for the count itself.
            sys.stderr.write("ranknum '{0}' is not a number, using 1000\n".format(sys.argv[3]))
        rank = 1000
    rank1 = read_urls(sys.argv[1])
    rank2 = read_urls(sys.argv[2])
    print(count_common(rank1, rank2, rank))

19
column.txt Normal file
View File

@@ -0,0 +1,19 @@
#데이타 베이스명,컬럼 명,Date 검색,Count
platform_name,플랫폼 이름,x,x
platform_form,플랫폼,x,x
article_form,종류,x,x
article_id,아이디,x,o
article_nickname,닉네임,x,o
article_date,날짜,o,o
article_title,타이틀,x,x
article_data,데이타,x,x
platform_id,플랫폼 아이디,x,o
platform_title,플랫폼 타이틀,x,x
article_url,주소,x,
article_parent,상위 댓글 작성자,x,o
article_order,댓글 순서,x,x
reply_url,다른 주소,x,
article_hit,조회수,x,x
keyword_id,검색어 번호,x
article_profileurl,프로파일 주소,x
article_profile,프로파일,x