Files
clients/WebBasedCrawler/insta/instatest.py
admin cff46799eb instagram 멀티로 실행하게 만들기
git-svn-id: svn://192.168.0.12/source@287 8346c931-da38-4b9b-9d4c-e48b93cbd075
2016-08-18 07:44:21 +00:00

82 lines
2.2 KiB
Python

from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import Browser
from base.baseclasses import enter_element
from selenium.webdriver.common.keys import Keys
def pageup_and_pagedown(_driver):
    """Scroll the page up a little, then further down, pausing after each step.

    Runs ``window.scrollBy(0, -300)`` three times and then
    ``window.scrollBy(0, 800)`` five times through
    ``_driver.execute_script``, calling ``wait(0.4)`` after every scroll.

    A key-press based variant (PAGE_UP / PAGE_DOWN sent to the ``<body>``
    element, with longer waits) previously lived here as commented-out code;
    only the script-based scrolling is active.

    Args:
        _driver: Browser driver exposing ``execute_script``.
    """
    # (repeat count, JavaScript snippet) pairs, executed in order.
    scroll_plan = (
        (3, "window.scrollBy(0, -300)"),
        (5, "window.scrollBy(0, 800)"),
    )
    for repeat, script in scroll_plan:
        for _ in range(repeat):
            _driver.execute_script(script)
            wait(0.4)
def first_load(_driver):
    """Find the first ``div._pupj3 > a`` link and pass it to ``enter_element``.

    NOTE(review): presumably this activates the initial "load more" link so
    that further scrolling keeps loading posts -- confirm against the page
    markup and ``enter_element``.

    Args:
        _driver: Browser driver exposing ``find_element_by_css_selector``.
    """
    enter_element(_driver.find_element_by_css_selector("div._pupj3 > a"))
def get_urls(_driver, url_set):
    """Collect the ``href`` of every ``div._myci9>a`` link into *url_set*.

    Links whose ``href`` attribute is missing or empty are skipped, so
    ``None`` / ``""`` never end up among the collected URLs.

    Args:
        _driver: Browser driver exposing ``find_elements_by_css_selector``.
        url_set: Set that is updated in place with the links found.

    Returns:
        None. The result is delivered through *url_set*.
    """
    hrefs = (
        anchor.get_attribute('href')
        for anchor in _driver.find_elements_by_css_selector("div._myci9>a")
    )
    # get_attribute() yields None when the attribute is absent; drop those.
    url_set.update(href for href in hrefs if href)
def remove_myci9(_driver, keep=4):
    """Remove leading ``div._myci9`` nodes so that at most *keep* remain.

    The nodes are counted through the driver, and the surplus
    (``count - keep``) is removed from the front of the document: the script
    repeatedly deletes the first ``div._myci9`` match. All removals happen in
    a single ``execute_script`` call instead of one browser round trip per
    node.

    NOTE(review): presumably this trims already-harvested rows to keep the
    DOM small during endless scrolling -- confirm with the caller.

    Args:
        _driver: Browser driver exposing ``find_elements_by_css_selector``
            and ``execute_script``.
        keep: Number of ``div._myci9`` nodes to leave in the page
            (default 4, the previously hard-coded value).
    """
    total = len(_driver.find_elements_by_css_selector("div._myci9"))
    surplus = total - keep
    if surplus <= 0:
        # Nothing to trim; avoid a pointless round trip to the browser.
        return
    # arguments[0] carries the number of nodes to delete.
    _driver.execute_script("""
        for (var i = 0; i < arguments[0]; i++) {
            var element = document.querySelector("div._myci9");
            if (!element)
                break;
            element.parentNode.removeChild(element);
        }
    """, surplus)
# --- Script body: crawl one Instagram hashtag page and log the URL count ---
# NOTE(review): leading indentation is missing in this copy of the file, so
# the nesting of the statements under `with` / `try` / `while` / `for` below
# cannot be read off the text -- restore it from the repository original.

# Obtain a driver from the project's Browser helper; 'ie' presumably selects
# Internet Explorer -- confirm in base.baseclasses.
browser = Browser()
driver = browser.get_new_driver('ie')
# Accumulates every post URL collected by get_urls().
url_sets = set()
wait(5)
# Hashtag explore page to crawl; the tag is percent-encoded in the URL.
url = "https://www.instagram.com/explore/tags/%EC%A4%8C%EB%A7%88%EA%B7%B8%EB%9E%A8/"
# Alternative hashtag page (disabled):
#url = 'https://www.instagram.com/explore/tags/%EB%A7%9B%EC%8A%A4%ED%83%80%EA%B7%B8%EB%9E%A8/'
driver.get(url)
# Same alternative page as above, written with the un-encoded tag (disabled):
#driver.get('https://www.instagram.com/explore/tags/맛스타그램/')
wait(5)
# Locate the `div._pupj3 > a` link and hand it to enter_element().
first_load(driver)
wait(3)
#print(driver.get_cookies())
# The log file is opened in 'w' mode, so any previous content is truncated.
with open("c:\\data\\instajummaie.txt", 'w') as f:
try:
# No `break` is visible, so this loop ends only through an exception or an
# external interruption.
while True:
# `j` is unused; the loop just repeats its body ten times.
# NOTE(review): which of the following calls belong inside this `for`
# (versus directly under `while`) is not recoverable from this copy.
for j in range(0, 10):
pageup_and_pagedown(driver)
get_urls(driver, url_sets)
remove_myci9(driver)
# Only the running count of unique URLs is written to the file (flushed on
# every write); the URLs themselves are not written here.
print("url count = {0}\n".format(len(url_sets)), flush=True, file=f)
finally:
# Runs on any exit from the `try`; printed to stdout, not to the log file.
# NOTE(review): the driver is not closed anywhere in this section.
print("finished")