인스타그램 크롤러 수정
sfilterprocess 잡다한 버그 디버깅

git-svn-id: svn://192.168.0.12/source@277 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -107,8 +107,8 @@ class InstaBodyCrawler:
|
||||
def find_article_data(self):
|
||||
ul = self.article.find_element_by_xpath("div/ul")
|
||||
try:
|
||||
li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
|
||||
span = li.find_element_by_xpath("h1/span")
|
||||
#li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
|
||||
span = ul.find_element_by_css_selector("li h1>span")
|
||||
return span.text
|
||||
except:
|
||||
return ""
|
||||
@@ -132,27 +132,32 @@ class InstaBodyCrawler:
|
||||
return None
|
||||
|
||||
def find_like_num(self):
|
||||
div = self.article.find_element_by_xpath("div/section/div[@data-reactid]")
|
||||
div = self.article.find_element_by_xpath("div/section[1]/div")
|
||||
try:
|
||||
span = div.find_element_by_css_selector("span[data-reactid$='.1'")
|
||||
span = div.find_element_by_xpath("span/span")
|
||||
str_num = span.text
|
||||
str_num = str_num.replace(',', '')
|
||||
if str_num[-1] == 'm':
|
||||
num = float(str_num[0:-1]) * 1000000
|
||||
elif str_num[-1] == 'k':
|
||||
num = float(str_num[0:-1]) * 1000
|
||||
else:
|
||||
num = int(str_num)
|
||||
num = int(str_num)
|
||||
return str(num)
|
||||
except:
|
||||
a_list = div.find_elements_by_tag_name("a")
|
||||
if len(a_list) > 1:
|
||||
return str(len(a_list))
|
||||
else:
|
||||
span = div.find_element_by_xpath("span[1]")
|
||||
if len(span.text.strip()) < 1:
|
||||
if a_list and a_list[0].get_attribute('title'):
|
||||
return str(1)
|
||||
else:
|
||||
return str(0)
|
||||
# span = div.find_element_by_xpath("span[1]")
|
||||
# if len(span.text.strip()) < 1:
|
||||
# return str(1)
|
||||
# else:
|
||||
# return str(0)
|
||||
|
||||
def find_reply_num(self):
|
||||
ul = self.article.find_element_by_xpath("div/ul")
|
||||
@@ -179,12 +184,13 @@ class InstaBodyCrawler:
|
||||
content["article_data"] = self.find_article_data()
|
||||
content["article_form"] = 'body'
|
||||
content["platform_form"] = 'post'
|
||||
content["platform_title"] = content["article_id"]
|
||||
reply_num = self.find_reply_num()
|
||||
if int(reply_num) > 0:
|
||||
content["article_order"] = int(reply_num)
|
||||
like_num = self.find_like_num()
|
||||
if int(float(like_num)) > 0:
|
||||
content["article_hit"] = int(float(like_num))
|
||||
content["reply_url"] = int(float(like_num))
|
||||
return content
|
||||
|
||||
def find_platform_title(self):
|
||||
@@ -193,6 +199,7 @@ class InstaBodyCrawler:
|
||||
def find_article_title(self):
|
||||
pass
|
||||
|
||||
|
||||
class InstaReplyCrawler:
|
||||
def __init__(self, driver=None, article=None):
|
||||
self.driver = driver
|
||||
@@ -217,24 +224,28 @@ class InstaReplyCrawler:
|
||||
|
||||
def has_more(self, ul):
|
||||
try:
|
||||
li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
|
||||
button = ul.find_element_by_css_selector("li>button")
|
||||
return True
|
||||
except Exception as e:
|
||||
return False
|
||||
|
||||
def read_more_reply(self, ul):
|
||||
try:
|
||||
button = ul.find_element_by_css_selector("li[data-reactid$='.1']>button")
|
||||
button = ul.find_element_by_css_selector("li>button")
|
||||
enter_element(button)
|
||||
except Exception as e:
|
||||
print_and_flush(e)
|
||||
|
||||
def read_all_reply(self, ul):
|
||||
for i in range(0, 10):
|
||||
if self.has_more(ul):
|
||||
self.read_more_reply(ul)
|
||||
else:
|
||||
break
|
||||
i = 0
|
||||
while i < 200 and self.has_more(ul):
|
||||
self.read_more_reply(ul)
|
||||
i += 1
|
||||
# for i in range(0, 10):
|
||||
# if self.has_more(ul):
|
||||
# self.read_more_reply(ul)
|
||||
# else:
|
||||
# break
|
||||
|
||||
def get_reply_ul(self):
|
||||
ul = self.article.find_element_by_xpath("div/ul")
|
||||
@@ -242,7 +253,7 @@ class InstaReplyCrawler:
|
||||
|
||||
def has_reply(self, ul):
|
||||
try:
|
||||
lis = ul.find_elements_by_css_selector("li[data-reactid*='comment']")
|
||||
lis = ul.find_elements_by_css_selector("li>a")
|
||||
if len(lis) > 0:
|
||||
return True
|
||||
except:
|
||||
@@ -294,7 +305,7 @@ class InstaReplyCrawler:
|
||||
|
||||
def find_article_data(self, ul):
|
||||
data_list = list()
|
||||
span_list = ul.find_elements_by_css_selector("li[data-reactid*='comment']>span")
|
||||
span_list = ul.find_elements_by_css_selector("li>span")
|
||||
for i in span_list:
|
||||
data_list.append(i.text)
|
||||
return data_list
|
||||
@@ -378,7 +389,9 @@ class InstaPageCrawler:
|
||||
|
||||
def has_first_page(self):
|
||||
try:
|
||||
a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60)
|
||||
#a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60)
|
||||
#a = find_element_by_css_selector(self.driver, "a[class='_8mlbc _vbtk2 _t5r8b']")
|
||||
a = find_element_by_css_selector(self.driver, "div[class='_8fxp6'] a[class='_8mlbc _vbtk2 _t5r8b']")
|
||||
enter_element(a)
|
||||
return True
|
||||
except:
|
||||
@@ -538,4 +551,4 @@ class InstaMainCrawler:
|
||||
real_time = self.crawl_init.is_realtime()
|
||||
print_and_flush("Finished Crawling :)")
|
||||
self.send_to_db.close()
|
||||
self.driver.quit()
|
||||
#self.driver.quit()
|
||||
|
||||
@@ -115,13 +115,15 @@ class KakaoBodyCrawler:
|
||||
return "body"
|
||||
|
||||
def find_article_data(self):
|
||||
more = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/p[@class='more _moreBtnContainer']")
|
||||
more = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']"
|
||||
"/p[@class='more _moreBtnContainer']")
|
||||
display = more.get_attribute("style")
|
||||
if display.find('none') == -1:
|
||||
a = more.find_element_by_tag_name("a")
|
||||
self.enter_element(a)
|
||||
try:
|
||||
content = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/div[@class='txt_wrap _content']")
|
||||
content = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']"
|
||||
"/div[@class='txt_wrap']/div[@class='_content']")
|
||||
except:
|
||||
return str("")
|
||||
return content.text
|
||||
|
||||
Reference in New Issue
Block a user