# -*- coding: utf-8 -*-
'''
Created on 2015. 12. 8.

@author: cococo
'''
import re
import datetime

from base.baseclasses import SendtoDB
from base.baseclasses import print_and_flush
from base.baseclasses import CrawlInit
from base.baseclasses import wait
from base.baseclasses import find_element_by_xpath
from base.baseclasses import find_element_by_css_selector
from base.baseclasses import enter_element
from base.baseclasses import Browser
from selenium.webdriver.common.action_chains import ActionChains

insta_url = "https://www.instagram.com/"
insta_tag_url = "https://www.instagram.com/explore/tags/"

class InstaInit(CrawlInit):

    def __init__(self, before_day=0):
        super().__init__(before_day)
        self.urls = dict()
        self.urls[9] = insta_tag_url   # platform 9: hashtag search
        self.urls[10] = insta_url      # platform 10: user page

    def split_searches(self):
        # Split the comma-separated keyword string from the DB and trim
        # each entry; tag keywords go through CrawlInit.utf8 for URL use.
        search = self.searches()
        split_list = search.split(',')
        trimmed_list = list()
        if self.platform() == 10:
            for x in split_list:
                trimmed_list.append(x.strip())
        else:
            for x in split_list:
                trimmed_list.append(self.utf8(x))
        return trimmed_list

    def make_url(self):
        # Build one crawl URL per keyword.
        urls = list()
        for x in self.split_searches():
            url = self.urls[self.platform()] + x
            urls.append(url)
        return urls

    def get_begin_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            result += datetime.timedelta(days=self.before_day)
            return result
        else:
            return self.start_day()

    def get_end_day(self):
        if self.is_realtime():
            date_now = datetime.datetime.now()
            result = datetime.datetime(year=date_now.year, month=date_now.month, day=date_now.day)
            return result
        else:
            return self.end_day()
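
# A minimal usage sketch (hypothetical values; searches() and platform() come
# from the keyword DB via CrawlInit and are not shown here). Assuming
# searches() returns "user_a, user_b" and platform() == 10:
#
#   init = InstaInit()
#   init.make_url()
#   # -> ["https://www.instagram.com/user_a",
#   #     "https://www.instagram.com/user_b"]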

class InstaBodyCrawler:

    def __init__(self, driver=None):
        self.driver = driver
        # Matches ISO-style datetimes such as "2015-12-08T09:30:00".
        self.re_date = re.compile(r"^(\d{4}-\d{2}-\d{2}).(\d{2}:\d{2}:\d{2})")

    def set_driver(self, driver):
        self.driver = driver

    def set_article(self, article=None):
        if article is None:
            try:
                self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
            except Exception as e:
                print_and_flush(e)
                raise
        else:
            self.article = article

    def find_article_url(self):
        a = self.article.find_element_by_xpath("div/section/a")
        return a.get_attribute("href")

    def find_article_profileurl(self):
        img = self.article.find_element_by_xpath("header/a/img[@src]")
        return img.get_attribute("src")

    def find_article_nickname(self):
        a = self.article.find_element_by_xpath("header/div/a")
        return a.text

    def find_article_date(self):
        # Normalize the <time datetime="..."> attribute to "YYYY-MM-DD HH:MM:SS".
        el_time = self.article.find_element_by_xpath("div/section/a/time")
        str_time = el_time.get_attribute("datetime")
        m = self.re_date.search(str_time)
        if m is None:
            return "0000-00-00 00:00:00"
        else:
            return m.group(1) + " " + m.group(2)
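
    # A worked example of the normalization above (hypothetical input):
    #
    #   >>> re_date = re.compile(r"^(\d{4}-\d{2}-\d{2}).(\d{2}:\d{2}:\d{2})")
    #   >>> m = re_date.search("2015-12-08T09:30:00")
    #   >>> m.group(1) + " " + m.group(2)
    #   '2015-12-08 09:30:00'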

    def find_article_data(self):
        # The caption is the first <li> of the list (react id suffix ".0").
        ul = self.article.find_element_by_xpath("div/ul")
        try:
            li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
            span = li.find_element_by_xpath("h1/span")
            return span.text
        except Exception:
            return ""

    def find_article_id(self):
        return self.find_platform_id()

    def find_platform_name(self):
        return 'instagram'

    def find_article_form(self):
        return 'body'

    def find_platform_id(self):
        # The author id is the last path segment of the profile link.
        a = self.article.find_element_by_xpath("header/div/a")
        if a:
            href = a.get_attribute("href")
            str_id = href.replace(insta_url, "").replace("/", "")
            return str_id
        else:
            return None

    def find_like_num(self):
        div = self.article.find_element_by_xpath("div/section/div[@data-reactid]")
        try:
            span = div.find_element_by_css_selector("span[data-reactid$='.1']")
            str_num = span.text
            # Counts may be abbreviated, e.g. "1.2k" or "3.4m".
            if str_num[-1] == 'm':
                num = float(str_num[0:-1]) * 1000000
            elif str_num[-1] == 'k':
                num = float(str_num[0:-1]) * 1000
            else:
                num = int(str_num)
            return str(num)
        except Exception:
            # No count span: fall back to counting linked likers.
            a_list = div.find_elements_by_tag_name("a")
            if len(a_list) > 1:
                return str(len(a_list))
            else:
                span = div.find_element_by_xpath("span[1]")
                if len(span.text.strip()) < 1:
                    return str(1)
                else:
                    return str(0)
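
    # A standalone sketch of the suffix handling above (hypothetical helper,
    # not part of this module):
    #
    #   def parse_count(s):
    #       if s.endswith('m'):
    #           return int(float(s[:-1]) * 1000000)
    #       if s.endswith('k'):
    #           return int(float(s[:-1]) * 1000)
    #       return int(s)
    #
    #   parse_count("1.2k")  # -> 1200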

    def find_reply_num(self):
        ul = self.article.find_element_by_xpath("div/ul")
        lis = ul.find_elements_by_tag_name("li")
        if len(lis) < 2:
            return "0"
        try:
            # If a "view all N comments" button exists, read N from it.
            li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
            span = li.find_element_by_xpath("button/span[2]")
            str_num = span.text.replace(",", "")
            return str_num
        except Exception:
            # Otherwise every <li> after the caption is a visible comment.
            return str(len(lis) - 1)

    def get_content(self):
        # Assemble one record per post for SendtoDB.
        content = dict()
        content["article_id"] = self.find_article_id()
        content["platform_id"] = self.find_platform_id()
        content["article_url"] = self.find_article_url()
        content["article_profileurl"] = self.find_article_profileurl()
        content["article_nickname"] = self.find_article_nickname()
        content["platform_name"] = self.find_platform_name()
        content["article_date"] = self.find_article_date()
        content["article_data"] = self.find_article_data()
        content["article_form"] = 'body'
        content["platform_form"] = 'post'
        reply_num = self.find_reply_num()
        if int(reply_num) > 0:
            content["article_order"] = int(reply_num)
        like_num = self.find_like_num()
        if int(float(like_num)) > 0:
            content["article_hit"] = int(float(like_num))
        return content

    def find_platform_title(self):
        pass

    def find_article_title(self):
        pass
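
# A sketch of the record get_content() assembles (illustrative values only):
#
#   {"article_id": "some_user", "platform_id": "some_user",
#    "article_url": "https://www.instagram.com/p/XXXX/",
#    "article_date": "2015-12-08 09:30:00",
#    "article_form": "body", "platform_form": "post",
#    "article_order": 12,   # present only when the reply count is > 0
#    "article_hit": 345}    # present only when the like count is > 0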

class InstaReplyCrawler:

    def __init__(self, driver=None, article=None):
        self.driver = driver
        self.activity = article
        self.reply_list = list()

    def find_init(self):
        self.reply_list.clear()

    def set_driver(self, driver):
        self.driver = driver

    def set_article(self, article=None):
        if article is None:
            try:
                self.article = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article', 10)
            except Exception as e:
                print_and_flush(e)
                raise
        else:
            self.article = article

    def has_more(self, ul):
        # A "load more comments" item renders as the <li> with react id suffix ".1".
        try:
            ul.find_element_by_css_selector("li[data-reactid$='.1']")
            return True
        except Exception:
            return False

    def read_more_reply(self, ul):
        try:
            button = ul.find_element_by_css_selector("li[data-reactid$='.1']>button")
            enter_element(button)
        except Exception as e:
            print_and_flush(e)

    def read_all_reply(self, ul):
        # Expand the comment list at most ten times so a page that keeps
        # offering more comments cannot stall the crawl.
        for i in range(0, 10):
            if self.has_more(ul):
                self.read_more_reply(ul)
            else:
                break

    def get_reply_ul(self):
        ul = self.article.find_element_by_xpath("div/ul")
        return ul

    def has_reply(self, ul):
        try:
            lis = ul.find_elements_by_css_selector("li[data-reactid*='comment']")
            if len(lis) > 0:
                return True
        except Exception:
            return False
        return False

    def crawl_all(self):
        self.find_init()
        self.set_article()
        try:
            ul = self.get_reply_ul()
            if self.has_reply(ul):
                self.read_all_reply(ul)
                self.crawl_reply(ul)
        except Exception as e:
            print_and_flush(e)

    def crawl_reply(self, ul):
        article_data = self.find_article_data(ul)
        article_id = self.find_article_id(ul)
        if len(article_data) != len(article_id):
            print_and_flush("article_data != article_id")
        # Pair ids with bodies; iterate over the shorter list so a length
        # mismatch cannot raise an IndexError.
        for i in range(0, min(len(article_id), len(article_data))):
            content = dict()
            content["article_data"] = article_data[i]
            content["article_id"] = article_id[i]
            content["article_nickname"] = article_id[i]
            content["platform_name"] = "instagram"
            content["platform_form"] = "post"
            content["article_form"] = 'reply'
            content["article_order"] = i
            self.reply_list.append(content)

    def get_content(self):
        return self.reply_list
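
    # Each reply record mirrors the body record's schema (illustrative values):
    #
    #   {"article_data": "nice shot!", "article_id": "commenter_1",
    #    "article_nickname": "commenter_1", "platform_name": "instagram",
    #    "platform_form": "post", "article_form": "reply", "article_order": 0}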

    def find_article_id(self, ul):
        # Commenter ids are the link texts of the <li>/<a> elements.
        id_list = list()
        a_list = ul.find_elements_by_xpath("li/a")
        for i in a_list:
            id_list.append(i.text)
        return id_list

    def find_article_profileurl(self, ul):
        pass

    def find_article_nickname(self, ul):
        return self.find_article_id(ul)

    def find_article_data(self, ul):
        # Comment bodies are the <span> children of comment <li> elements.
        data_list = list()
        span_list = ul.find_elements_by_css_selector("li[data-reactid*='comment']>span")
        for i in span_list:
            data_list.append(i.text)
        return data_list

    def find_article_url(self, ul):
        pass

    def find_platform_id(self, ul):
        pass

    def find_article_form(self, ul=None):
        return 'reply'

    def find_platform_name(self, ul=None):
        return 'instagram'

    def find_platform_form(self, ul=None):
        return 'post'

    def click_element(self, element):
        # Click at the element's top-left corner via ActionChains.
        ac = ActionChains(self.driver)
        ac.move_to_element_with_offset(element, 0, 0).click().perform()
        wait(2)

class InstaPageCrawler:

    def __init__(self, driver=None, begin_date=None, end_date=None):
        self.driver = driver
        self.url_set = set()   # URLs already crawled in this run
        self.begin_date = begin_date
        self.end_date = end_date
        self.re_date = re.compile(r"^(\d{4}-\d{2}-\d{2}).(\d{2}:\d{2}:\d{2})")

    def set_driver(self, driver):
        self.driver = driver

    def find_article_url(self):
        a = find_element_by_xpath(self.driver, '/html/body/div/div/div/div/article/div/section/a', 60)
        return a.get_attribute("href")

    def init(self):
        self.url_set.clear()

    def set_date(self, begin_date, end_date):
        self.set_begin_date(begin_date)
        self.set_end_date(end_date)

    def set_end_date(self, end_date):
        # Accept a 'YYYY-MM-DD' string, a datetime/date, or anything else (today).
        if isinstance(end_date, str):
            self.end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
        elif isinstance(end_date, (datetime.datetime, datetime.date)):
            self.end_date = end_date
        else:
            self.end_date = datetime.datetime.today()
        # Normalize to midnight, then add one day so the whole end day
        # falls inside the crawl window.
        self.end_date = datetime.datetime(year=self.end_date.year, month=self.end_date.month, day=self.end_date.day)
        self.end_date += datetime.timedelta(days=1)

    def set_begin_date(self, begin_date):
        if isinstance(begin_date, str):
            self.begin_date = datetime.datetime.strptime(begin_date, '%Y-%m-%d')
        elif isinstance(begin_date, (datetime.datetime, datetime.date)):
            self.begin_date = begin_date
        else:
            self.begin_date = datetime.datetime.today()
        self.begin_date = datetime.datetime(year=self.begin_date.year, month=self.begin_date.month, day=self.begin_date.day)
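
    # A worked example of the normalization (assuming today is 2015-12-08):
    #
    #   pc = InstaPageCrawler()
    #   pc.set_date('2015-12-01', None)
    #   pc.begin_date  # -> datetime.datetime(2015, 12, 1, 0, 0)
    #   pc.end_date    # -> datetime.datetime(2015, 12, 9, 0, 0), i.e. today + 1 day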

    def has_next(self):
        try:
            find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
            return True
        except Exception:
            return False

    def move_next(self):
        try:
            a = find_element_by_css_selector(self.driver, "a[class$='RightPaginationArrow']", 30)
            enter_element(a)
            return True
        except Exception:
            return False

    def has_first_page(self):
        # Open the first post of the "most recent" section, if present.
        try:
            a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60)
            enter_element(a)
            return True
        except Exception:
            return False

    def crawling_ok(self, url):
        self.url_set.add(url)

    def is_earlier(self, time_date):
        # True when the post predates the crawl window.
        return time_date < self.begin_date

    def is_late(self, time_date):
        # True when the post postdates the crawl window.
        return time_date > self.end_date

    def find_article_date(self):
        el_time = find_element_by_xpath(self.driver, "/html/body/div/div/div/div/article/div/section/a/time", 60)
        str_time = el_time.get_attribute("datetime")
        m = self.re_date.search(str_time)
        if m is None:
            return "0000-00-00 00:00:00"
        else:
            return m.group(1) + " " + m.group(2)

class InstaMainCrawler:

    def __init__(self):
        self.page_crawler = InstaPageCrawler()
        self.body_crawler = InstaBodyCrawler()
        self.reply_crawler = InstaReplyCrawler()
        self.send_to_db = SendtoDB()
        self.browser = Browser()
        self.crawl_init = InstaInit()
        self.driver = None

    def set_driver(self, driver):
        self.page_crawler.set_driver(driver)
        self.body_crawler.set_driver(driver)
        self.reply_crawler.set_driver(driver)
        self.driver = driver

    def set_keyword_id(self, keyword_id):
        self.keyword_id = keyword_id

    def crawl_all(self, backup_set=None):
        self.page_crawler.init()
        if backup_set:
            # Resume from a previous attempt's set of already-crawled URLs.
            self.page_crawler.url_set = backup_set.copy()
        if not self.page_crawler.has_first_page():
            return
        while True:
            str_date = self.page_crawler.find_article_date()
            date_val = datetime.datetime.strptime(str_date, '%Y-%m-%d %H:%M:%S')
            print_and_flush(str_date)
            # Skip posts already crawled in this run.
            if self.page_crawler.find_article_url() in self.page_crawler.url_set:
                if self.page_crawler.has_next():
                    self.page_crawler.move_next()
                    continue
                else:
                    break
            # Posts newer than the window: keep paging until we reach it.
            if self.page_crawler.is_late(date_val):
                if self.page_crawler.has_next():
                    self.page_crawler.move_next()
                    continue
                else:
                    break
            # Posts older than the window: stop; results run newest-first.
            if self.page_crawler.is_earlier(date_val):
                break
            try:
                body_content = self.crawl_body()
                self.crawl_reply(body_content)
                self.page_crawler.url_set.add(body_content["article_url"])
                print_and_flush("ok")
            except Exception as e:
                print_and_flush('fail')
                print_and_flush(e)
            if self.page_crawler.has_next():
                self.page_crawler.move_next()
            else:
                break

    def crawl_body(self):
        self.body_crawler.set_driver(self.driver)
        self.body_crawler.set_article()
        content = self.body_crawler.get_content()
        content["keyword_id"] = self.keyword_id
        print_and_flush(content["article_url"])
        # Replace any previously stored copy of this post.
        self.send_to_db.delete_url(content['article_url'])
        self.send_to_db.send_body(content)
        return content

    def crawl_reply(self, body_content):
        self.reply_crawler.set_driver(self.driver)
        self.reply_crawler.crawl_all()
        content_list = self.reply_crawler.get_content()
        if content_list:
            # Replies inherit the parent post's URL and author id.
            for i in content_list:
                i['article_url'] = body_content['article_url']
                i['platform_id'] = body_content['platform_id']
            self.send_to_db.send_reply(content_list)

    def start(self):
        self.crawler_start()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        self.init_browser(browser)
        self.init_keyword_id(keyword_id)
        self.init_db(db_num)
        self.init_before_day(before_day)
        self.init_until_page(until_page)

    def init_browser(self, browser):
        self.set_driver(self.browser.get_new_driver(browser))

    def init_keyword_id(self, keyword_id):
        if not isinstance(keyword_id, int):
            self.keyword_id = int(keyword_id)
        else:
            self.keyword_id = keyword_id
        self.crawl_init.get_keyword_parameters(keyword_id)
        self.crawl_init.disconnect()

    def init_db(self, db_num):
        self.send_to_db.set_db(db_num)

    def init_before_day(self, before_day):
        self.crawl_init.set_before_day(before_day)

    def init_until_page(self, until_page):
        self.crawl_init.set_until_page(until_page)

    def crawler_start(self):
        real_time = True
        while real_time:
            print_and_flush("Crawling Start")
            url_list = self.crawl_init.make_url()
            i = 0
            backup_set = set()
            while i < len(url_list):
                try:
                    print_and_flush(url_list[i] + "\n")
                    self.driver.get(url_list[i])
                    wait(3)
                    self.page_crawler.set_date(begin_date=self.crawl_init.get_begin_day(),
                                               end_date=self.crawl_init.get_end_day())
                    if self.page_crawler.has_first_page():
                        self.crawl_all(backup_set)
                    i += 1
                    backup_set.clear()
                except Exception as e:
                    # On a browser failure, keep the crawled-URL set, restart
                    # the driver, and retry the same keyword URL.
                    print_and_flush(e)
                    backup_set = self.page_crawler.url_set.copy()
                    self.driver.quit()
                    self.set_driver(self.browser.new_browser())
                    wait(5)
            real_time = self.crawl_init.is_realtime()
        print_and_flush("Finished Crawling :)")
        self.send_to_db.close()
        self.driver.quit()
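
# A minimal end-to-end sketch (hypothetical argument values; Browser, SendtoDB
# and CrawlInit are supplied by base.baseclasses and are not shown here):
#
#   crawler = InstaMainCrawler()
#   crawler.set_arguments(browser='chrome', keyword_id=42, db_num=1,
#                         before_day=-7, until_page=10)
#   crawler.start()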