82 lines
2.2 KiB
Python
82 lines
2.2 KiB
Python
from base.baseclasses import SendtoDB
|
|
from base.baseclasses import print_and_flush
|
|
from base.baseclasses import CrawlInit
|
|
from base.baseclasses import wait
|
|
from base.baseclasses import Browser
|
|
from base.baseclasses import enter_element
|
|
from selenium.webdriver.common.keys import Keys
|
|
|
|
|
|
def pageup_and_pagedown(_driver, *, up_steps=3, up_pixels=300,
                        down_steps=5, down_pixels=800, pause=0.4):
    """Scroll the page up in small steps, then down in larger steps.

    Every step runs ``window.scrollBy`` inside the page through
    ``_driver.execute_script`` and then calls ``wait(pause)``.  With the
    default arguments this is 3 steps of 300 px up followed by 5 steps of
    800 px down, i.e. the same scripts and pauses the function always used.

    Args:
        _driver: browser driver exposing ``execute_script``.
        up_steps: number of upward scroll steps.
        up_pixels: pixels scrolled up per step.
        down_steps: number of downward scroll steps.
        down_pixels: pixels scrolled down per step.
        pause: value handed to ``wait`` after every step.

    NOTE(review): presumably the up/down jiggle is what makes the page
    load additional content -- confirm against the target site.
    """
    # (step count, vertical delta) pairs, executed in order: up first.
    scroll_plan = ((up_steps, -up_pixels), (down_steps, down_pixels))
    for steps, delta in scroll_plan:
        script = "window.scrollBy(0, {0})".format(delta)
        for _ in range(steps):
            _driver.execute_script(script)
            wait(pause)
|
|
|
|
def first_load(_driver):
    """Find the ``div._pupj3 > a`` link and pass it to ``enter_element``.

    NOTE(review): presumably this is the link that starts loading further
    content on the page -- confirm.
    NOTE(review): ``find_element_by_css_selector`` no longer exists in
    Selenium >= 4.3, so this relies on an older Selenium release.
    """
    enter_element(_driver.find_element_by_css_selector("div._pupj3 > a"))
|
|
|
|
|
|
def get_urls(_driver, url_set):
    """Add the ``href`` of every ``div._myci9>a`` link to *url_set*.

    Args:
        _driver: browser driver exposing ``find_elements_by_css_selector``.
        url_set: set that is updated in place with the collected hrefs.

    Returns:
        None; the result is delivered through *url_set*.

    Links whose ``href`` comes back as ``None`` or an empty string are
    skipped, so the set (and any count derived from it) only reflects
    usable URLs.

    NOTE(review): ``find_elements_by_css_selector`` no longer exists in
    Selenium >= 4.3, so this relies on an older Selenium release.
    """
    anchors = _driver.find_elements_by_css_selector("div._myci9>a")
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    url_set.update(href for href in hrefs if href)
|
|
|
|
|
|
def remove_myci9(_driver, *, keep=4):
    """Delete ``div._myci9`` nodes from the page, leaving *keep* of them.

    The nodes are counted through ``find_elements_by_css_selector``; if
    more than *keep* exist, the surplus is removed inside the page by a
    single ``execute_script`` call.  The script repeatedly takes the first
    ``div._myci9`` that ``document.querySelector`` returns and detaches it
    from its parent, stopping early if none is left.

    Args:
        _driver: browser driver exposing ``find_elements_by_css_selector``
            and ``execute_script``.
        keep: number of ``div._myci9`` nodes to leave in the document
            (default 4, the value that used to be hard-coded).

    NOTE(review): presumably this trims already-harvested rows so the DOM
    stays small during long crawls -- confirm.
    NOTE(review): ``find_elements_by_css_selector`` no longer exists in
    Selenium >= 4.3, so this relies on an older Selenium release.
    """
    elements = _driver.find_elements_by_css_selector("div._myci9")
    surplus = len(elements) - keep
    if surplus <= 0:
        # Nothing to trim; skip the round trip to the browser entirely.
        return
    # One script call for the whole batch instead of one call per node.
    _driver.execute_script(
        """
        var remaining = arguments[0];
        while (remaining-- > 0) {
            var element = document.querySelector("div._myci9");
            if (!element)
                break;
            element.parentNode.removeChild(element);
        }
        """,
        surplus,
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Crawl script: open a hashtag page, keep scrolling it, and log how many
# distinct link URLs have been collected so far.
# ---------------------------------------------------------------------------
browser = Browser()
driver = browser.get_new_driver('ie')

url_sets = set()
url = "https://www.instagram.com/explore/tags/%EC%A4%8C%EB%A7%88%EA%B7%B8%EB%9E%A8/"
# Alternative tag page (disabled):
# url = 'https://www.instagram.com/explore/tags/%EB%A7%9B%EC%8A%A4%ED%83%80%EA%B7%B8%EB%9E%A8/'

try:
    wait(5)
    driver.get(url)
    wait(5)

    first_load(driver)
    wait(3)

    with open("c:\\data\\instajummaie.txt", 'w') as f:
        try:
            # Runs until an exception (including Ctrl+C) interrupts it.
            while True:
                # Ten scroll rounds between two harvests.
                for _ in range(10):
                    pageup_and_pagedown(driver)

                get_urls(driver, url_sets)
                remove_myci9(driver)
                # NOTE(review): only the size of url_sets is written out;
                # the URLs themselves are never persisted -- confirm that
                # this is intended.
                print("url count = {0}\n".format(len(url_sets)), flush=True, file=f)
        finally:
            print("finished")
finally:
    # Always shut the browser down; otherwise every aborted run leaves a
    # browser window and its driver process behind.
    # NOTE(review): assumes get_new_driver() returns a Selenium WebDriver
    # (which provides quit()) -- confirm against base.baseclasses.Browser.
    driver.quit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|