인스타그램 크롤러 수정
sfilterprocess 잡다버그 디버깅 git-svn-id: svn://192.168.0.12/source@277 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -107,8 +107,8 @@ class InstaBodyCrawler:
|
|||||||
def find_article_data(self):
|
def find_article_data(self):
|
||||||
ul = self.article.find_element_by_xpath("div/ul")
|
ul = self.article.find_element_by_xpath("div/ul")
|
||||||
try:
|
try:
|
||||||
li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
|
#li = ul.find_element_by_css_selector("li[data-reactid$='.0']")
|
||||||
span = li.find_element_by_xpath("h1/span")
|
span = ul.find_element_by_css_selector("li h1>span")
|
||||||
return span.text
|
return span.text
|
||||||
except:
|
except:
|
||||||
return ""
|
return ""
|
||||||
@@ -132,27 +132,32 @@ class InstaBodyCrawler:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def find_like_num(self):
|
def find_like_num(self):
|
||||||
div = self.article.find_element_by_xpath("div/section/div[@data-reactid]")
|
div = self.article.find_element_by_xpath("div/section[1]/div")
|
||||||
try:
|
try:
|
||||||
span = div.find_element_by_css_selector("span[data-reactid$='.1'")
|
span = div.find_element_by_xpath("span/span")
|
||||||
str_num = span.text
|
str_num = span.text
|
||||||
|
str_num = str_num.replace(',', '')
|
||||||
if str_num[-1] == 'm':
|
if str_num[-1] == 'm':
|
||||||
num = float(str_num[0:-1]) * 1000000
|
num = float(str_num[0:-1]) * 1000000
|
||||||
elif str_num[-1] == 'k':
|
elif str_num[-1] == 'k':
|
||||||
num = float(str_num[0:-1]) * 1000
|
num = float(str_num[0:-1]) * 1000
|
||||||
else:
|
else:
|
||||||
num = int(str_num)
|
num = int(str_num)
|
||||||
return str(num)
|
return str(num)
|
||||||
except:
|
except:
|
||||||
a_list = div.find_elements_by_tag_name("a")
|
a_list = div.find_elements_by_tag_name("a")
|
||||||
if len(a_list) > 1:
|
if len(a_list) > 1:
|
||||||
return str(len(a_list))
|
return str(len(a_list))
|
||||||
else:
|
else:
|
||||||
span = div.find_element_by_xpath("span[1]")
|
if a_list and a_list[0].get_attribute('title'):
|
||||||
if len(span.text.strip()) < 1:
|
|
||||||
return str(1)
|
return str(1)
|
||||||
else:
|
else:
|
||||||
return str(0)
|
return str(0)
|
||||||
|
# span = div.find_element_by_xpath("span[1]")
|
||||||
|
# if len(span.text.strip()) < 1:
|
||||||
|
# return str(1)
|
||||||
|
# else:
|
||||||
|
# return str(0)
|
||||||
|
|
||||||
def find_reply_num(self):
|
def find_reply_num(self):
|
||||||
ul = self.article.find_element_by_xpath("div/ul")
|
ul = self.article.find_element_by_xpath("div/ul")
|
||||||
@@ -179,12 +184,13 @@ class InstaBodyCrawler:
|
|||||||
content["article_data"] = self.find_article_data()
|
content["article_data"] = self.find_article_data()
|
||||||
content["article_form"] = 'body'
|
content["article_form"] = 'body'
|
||||||
content["platform_form"] = 'post'
|
content["platform_form"] = 'post'
|
||||||
|
content["platform_title"] = content["article_id"]
|
||||||
reply_num = self.find_reply_num()
|
reply_num = self.find_reply_num()
|
||||||
if int(reply_num) > 0:
|
if int(reply_num) > 0:
|
||||||
content["article_order"] = int(reply_num)
|
content["article_order"] = int(reply_num)
|
||||||
like_num = self.find_like_num()
|
like_num = self.find_like_num()
|
||||||
if int(float(like_num)) > 0:
|
if int(float(like_num)) > 0:
|
||||||
content["article_hit"] = int(float(like_num))
|
content["reply_url"] = int(float(like_num))
|
||||||
return content
|
return content
|
||||||
|
|
||||||
def find_platform_title(self):
|
def find_platform_title(self):
|
||||||
@@ -193,6 +199,7 @@ class InstaBodyCrawler:
|
|||||||
def find_article_title(self):
|
def find_article_title(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class InstaReplyCrawler:
|
class InstaReplyCrawler:
|
||||||
def __init__(self, driver=None, article=None):
|
def __init__(self, driver=None, article=None):
|
||||||
self.driver = driver
|
self.driver = driver
|
||||||
@@ -217,24 +224,28 @@ class InstaReplyCrawler:
|
|||||||
|
|
||||||
def has_more(self, ul):
|
def has_more(self, ul):
|
||||||
try:
|
try:
|
||||||
li = ul.find_element_by_css_selector("li[data-reactid$='.1']")
|
button = ul.find_element_by_css_selector("li>button")
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def read_more_reply(self, ul):
|
def read_more_reply(self, ul):
|
||||||
try:
|
try:
|
||||||
button = ul.find_element_by_css_selector("li[data-reactid$='.1']>button")
|
button = ul.find_element_by_css_selector("li>button")
|
||||||
enter_element(button)
|
enter_element(button)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print_and_flush(e)
|
print_and_flush(e)
|
||||||
|
|
||||||
def read_all_reply(self, ul):
|
def read_all_reply(self, ul):
|
||||||
for i in range(0, 10):
|
i = 0
|
||||||
if self.has_more(ul):
|
while i < 200 and self.has_more(ul):
|
||||||
self.read_more_reply(ul)
|
self.read_more_reply(ul)
|
||||||
else:
|
i += 1
|
||||||
break
|
# for i in range(0, 10):
|
||||||
|
# if self.has_more(ul):
|
||||||
|
# self.read_more_reply(ul)
|
||||||
|
# else:
|
||||||
|
# break
|
||||||
|
|
||||||
def get_reply_ul(self):
|
def get_reply_ul(self):
|
||||||
ul = self.article.find_element_by_xpath("div/ul")
|
ul = self.article.find_element_by_xpath("div/ul")
|
||||||
@@ -242,7 +253,7 @@ class InstaReplyCrawler:
|
|||||||
|
|
||||||
def has_reply(self, ul):
|
def has_reply(self, ul):
|
||||||
try:
|
try:
|
||||||
lis = ul.find_elements_by_css_selector("li[data-reactid*='comment']")
|
lis = ul.find_elements_by_css_selector("li>a")
|
||||||
if len(lis) > 0:
|
if len(lis) > 0:
|
||||||
return True
|
return True
|
||||||
except:
|
except:
|
||||||
@@ -294,7 +305,7 @@ class InstaReplyCrawler:
|
|||||||
|
|
||||||
def find_article_data(self, ul):
|
def find_article_data(self, ul):
|
||||||
data_list = list()
|
data_list = list()
|
||||||
span_list = ul.find_elements_by_css_selector("li[data-reactid*='comment']>span")
|
span_list = ul.find_elements_by_css_selector("li>span")
|
||||||
for i in span_list:
|
for i in span_list:
|
||||||
data_list.append(i.text)
|
data_list.append(i.text)
|
||||||
return data_list
|
return data_list
|
||||||
@@ -378,7 +389,9 @@ class InstaPageCrawler:
|
|||||||
|
|
||||||
def has_first_page(self):
|
def has_first_page(self):
|
||||||
try:
|
try:
|
||||||
a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60)
|
#a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60)
|
||||||
|
#a = find_element_by_css_selector(self.driver, "a[class='_8mlbc _vbtk2 _t5r8b']")
|
||||||
|
a = find_element_by_css_selector(self.driver, "div[class='_8fxp6'] a[class='_8mlbc _vbtk2 _t5r8b']")
|
||||||
enter_element(a)
|
enter_element(a)
|
||||||
return True
|
return True
|
||||||
except:
|
except:
|
||||||
@@ -538,4 +551,4 @@ class InstaMainCrawler:
|
|||||||
real_time = self.crawl_init.is_realtime()
|
real_time = self.crawl_init.is_realtime()
|
||||||
print_and_flush("Finished Crawling :)")
|
print_and_flush("Finished Crawling :)")
|
||||||
self.send_to_db.close()
|
self.send_to_db.close()
|
||||||
self.driver.quit()
|
#self.driver.quit()
|
||||||
|
|||||||
@@ -115,13 +115,15 @@ class KakaoBodyCrawler:
|
|||||||
return "body"
|
return "body"
|
||||||
|
|
||||||
def find_article_data(self):
|
def find_article_data(self):
|
||||||
more = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/p[@class='more _moreBtnContainer']")
|
more = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']"
|
||||||
|
"/p[@class='more _moreBtnContainer']")
|
||||||
display = more.get_attribute("style")
|
display = more.get_attribute("style")
|
||||||
if display.find('none') == -1:
|
if display.find('none') == -1:
|
||||||
a = more.find_element_by_tag_name("a")
|
a = more.find_element_by_tag_name("a")
|
||||||
self.enter_element(a)
|
self.enter_element(a)
|
||||||
try:
|
try:
|
||||||
content = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/div[@class='txt_wrap _content']")
|
content = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']"
|
||||||
|
"/div[@class='txt_wrap']/div[@class='_content']")
|
||||||
except:
|
except:
|
||||||
return str("")
|
return str("")
|
||||||
return content.text
|
return content.text
|
||||||
|
|||||||
@@ -139,7 +139,10 @@ bool SConsumer::makeOverallCategory(int _nCategory)
|
|||||||
for (QMap<QString, int>::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end();
|
for (QMap<QString, int>::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end();
|
||||||
iterPos1++)
|
iterPos1++)
|
||||||
{
|
{
|
||||||
streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value());
|
if (streal.m_mapRealReplyUniqueUrl.contains(iterPos1.key()) && streal.m_mapRealReplyUniqueUrl.values(iterPos1.key()).contains(iterPos1.value()))
|
||||||
|
continue;
|
||||||
|
else
|
||||||
|
streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -346,6 +346,7 @@ bool SDatagroup::makeTable()
|
|||||||
stNickname stnickname;
|
stNickname stnickname;
|
||||||
stnickname.nickname = strListReply[anColumn[SInitializer::E_DATA_article_nickname]];
|
stnickname.nickname = strListReply[anColumn[SInitializer::E_DATA_article_nickname]];
|
||||||
stnickname.num = nNickname++;
|
stnickname.num = nNickname++;
|
||||||
|
stnickname.platformname_num = m_pSInitializer->getPlatformName(strListReply[anColumn[SInitializer::E_DATA_platform_name]].trimmed());
|
||||||
m_SNickname.put(strListReply[anColumn[SInitializer::E_DATA_article_nickname]].trimmed(), stnickname);
|
m_SNickname.put(strListReply[anColumn[SInitializer::E_DATA_article_nickname]].trimmed(), stnickname);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<!DOCTYPE QtCreatorProject>
|
<!DOCTYPE QtCreatorProject>
|
||||||
<!-- Written by QtCreator 3.3.0, 2016-06-07T18:39:42. -->
|
<!-- Written by QtCreator 3.3.0, 2016-06-10T19:06:00. -->
|
||||||
<qtcreator>
|
<qtcreator>
|
||||||
<data>
|
<data>
|
||||||
<variable>EnvironmentId</variable>
|
<variable>EnvironmentId</variable>
|
||||||
@@ -787,7 +787,7 @@
|
|||||||
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName">sfilterprocess</value>
|
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName">sfilterprocess</value>
|
||||||
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DisplayName"></value>
|
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DisplayName"></value>
|
||||||
<value type="QString" key="ProjectExplorer.ProjectConfiguration.Id">Qt4ProjectManager.Qt4RunConfiguration:C:/source/sfilterprocess/sfilterprocess.pro</value>
|
<value type="QString" key="ProjectExplorer.ProjectConfiguration.Id">Qt4ProjectManager.Qt4RunConfiguration:C:/source/sfilterprocess/sfilterprocess.pro</value>
|
||||||
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.CommandLineArguments">"470" "test2y"</value>
|
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.CommandLineArguments">"469" "test2y"</value>
|
||||||
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.ProFile">sfilterprocess.pro</value>
|
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.ProFile">sfilterprocess.pro</value>
|
||||||
<value type="bool" key="Qt4ProjectManager.Qt4RunConfiguration.UseDyldImageSuffix">false</value>
|
<value type="bool" key="Qt4ProjectManager.Qt4RunConfiguration.UseDyldImageSuffix">false</value>
|
||||||
<value type="bool" key="Qt4ProjectManager.Qt4RunConfiguration.UseTerminal">true</value>
|
<value type="bool" key="Qt4ProjectManager.Qt4RunConfiguration.UseTerminal">true</value>
|
||||||
|
|||||||
@@ -156,7 +156,10 @@ bool SInfluencer::makeOverallCategory(int _categoryNum)
|
|||||||
for (QMap<QString, int>::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end();
|
for (QMap<QString, int>::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end();
|
||||||
iterPos1++)
|
iterPos1++)
|
||||||
{
|
{
|
||||||
streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value());
|
if (streal.m_mapRealReplyUniqueUrl.contains(iterPos1.key()) && streal.m_mapRealReplyUniqueUrl.values(iterPos1.key()).contains(iterPos1.value()))
|
||||||
|
continue;
|
||||||
|
else
|
||||||
|
streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -110,7 +110,8 @@ void SInitializer::updateWebDBInfoComplete(QSqlDatabase _db, QString _mindate, Q
|
|||||||
"replycount = " + QString::number(_replycount)+" , "
|
"replycount = " + QString::number(_replycount)+" , "
|
||||||
"mindate = '" + _mindate + "',"
|
"mindate = '" + _mindate + "',"
|
||||||
"maxdate = '" + _maxdate + "',"
|
"maxdate = '" + _maxdate + "',"
|
||||||
"lastupdate = '" + _lastupdate + "' "
|
//"lastupdate = '" + _lastupdate + "' "
|
||||||
|
"lastupdate = '" + QDateTime::currentDateTime().toString("yyyy-MM-dd hh:mm:ss") + "' "
|
||||||
"where company_num = " + QString::number(_nCompany);
|
"where company_num = " + QString::number(_nCompany);
|
||||||
_db.exec(strQuery.toUtf8());
|
_db.exec(strQuery.toUtf8());
|
||||||
}
|
}
|
||||||
@@ -154,12 +155,12 @@ bool SInitializer::initSpammer()
|
|||||||
settings.beginGroup(QString("spammers"));
|
settings.beginGroup(QString("spammers"));
|
||||||
|
|
||||||
if (settings.childKeys().size() == 0) return false;
|
if (settings.childKeys().size() == 0) return false;
|
||||||
m_adSpammerParam[E_SPAMER_BODY_COUNT_CUT] = settings.value("body_cut").toDouble();
|
m_adSpammerParam[E_SPAMER_BODY_COUNT_CUT] = settings.value("body_cut").toDouble();
|
||||||
m_adSpammerParam[E_SPAMER_BODY_COUNT_RATIO] = settings.value("body_ratio").toDouble();
|
m_adSpammerParam[E_SPAMER_BODY_COUNT_RATIO] = settings.value("body_ratio").toDouble();
|
||||||
m_adSpammerParam[E_SPAMER_REPLY_COUNT_CUT] = settings.value("reply_cut").toDouble();
|
m_adSpammerParam[E_SPAMER_REPLY_COUNT_CUT] = settings.value("reply_cut").toDouble();
|
||||||
m_adSpammerParam[E_SPAMER_REPLY_COUNT_RATIO] = settings.value("reply_ratio").toDouble();
|
m_adSpammerParam[E_SPAMER_REPLY_COUNT_RATIO] = settings.value("reply_ratio").toDouble();
|
||||||
m_adSpammerParam[E_SPAMER_NICK_COUNT_CUT] = settings.value("nick_cut").toDouble();
|
m_adSpammerParam[E_SPAMER_NICK_COUNT_CUT] = settings.value("nick_cut").toDouble();
|
||||||
m_adSpammerParam[E_SPAMER_NICK_COUNT_RATIO] = settings.value("nick_ratio").toDouble();
|
m_adSpammerParam[E_SPAMER_NICK_COUNT_RATIO] = settings.value("nick_ratio").toDouble();
|
||||||
settings.endGroup();
|
settings.endGroup();
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|||||||
@@ -156,7 +156,10 @@ bool SPowercafe::makeOverallCategory(int _categoryNum)
|
|||||||
for (QMap<QString, int>::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end();
|
for (QMap<QString, int>::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end();
|
||||||
iterPos1++)
|
iterPos1++)
|
||||||
{
|
{
|
||||||
streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value());
|
if (streal.m_mapRealReplyUniqueUrl.contains(iterPos1.key()) && streal.m_mapRealReplyUniqueUrl.values(iterPos1.key()).contains(iterPos1.value()))
|
||||||
|
continue;
|
||||||
|
else
|
||||||
|
streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -150,7 +150,10 @@ bool SSpammer::makeOverallCategory(int _categoryNum)
|
|||||||
for (QMap<QString, int>::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end();
|
for (QMap<QString, int>::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end();
|
||||||
iterPos1++)
|
iterPos1++)
|
||||||
{
|
{
|
||||||
streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value());
|
if (streal.m_mapRealReplyUniqueUrl.contains(iterPos1.key()) && streal.m_mapRealReplyUniqueUrl.values(iterPos1.key()).contains(iterPos1.value()))
|
||||||
|
continue;
|
||||||
|
else
|
||||||
|
streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -222,7 +225,7 @@ void SSpammer::STSpammer::makeTable(SDatagroup &_datagroup, const QMap<int, stRe
|
|||||||
stSpammer stspammer;
|
stSpammer stspammer;
|
||||||
stspammer.bodycount = iterPos1->body;
|
stspammer.bodycount = iterPos1->body;
|
||||||
stspammer.replycount = iterPos1->reply;
|
stspammer.replycount = iterPos1->reply;
|
||||||
stspammer.nicknamecount = iterPos1->nickname.size();
|
stspammer.nicknamecount = iterPos1->nickname.size() - 1;
|
||||||
stspammer.id_num = iterPos1->id_num;
|
stspammer.id_num = iterPos1->id_num;
|
||||||
stspammer.id_id = iterPos1.key().split(",").at(0);
|
stspammer.id_id = iterPos1.key().split(",").at(0);
|
||||||
stspammer.category_num = iterPos.key();
|
stspammer.category_num = iterPos.key();
|
||||||
@@ -419,7 +422,7 @@ void SSpammer::SStatsSpaSpammerRank::makeTable(SDatagroup &_datagroup, const QMa
|
|||||||
ststats.bodycount = iterPos1->body;
|
ststats.bodycount = iterPos1->body;
|
||||||
ststats.replycount = iterPos1->reply;
|
ststats.replycount = iterPos1->reply;
|
||||||
ststats.id_num = iterPos1->id_num;
|
ststats.id_num = iterPos1->id_num;
|
||||||
ststats.nicknamecount = iterPos1->nickname.size();
|
ststats.nicknamecount = iterPos1->nickname.size() - 1;
|
||||||
ststats.id_id = iterPos1.key().split(",").at(0);
|
ststats.id_id = iterPos1.key().split(",").at(0);
|
||||||
ststats.platformname_name = iterPos1.key().split(",").at(1);
|
ststats.platformname_name = iterPos1.key().split(",").at(1);
|
||||||
ststats.subject = "spammervalue";
|
ststats.subject = "spammervalue";
|
||||||
|
|||||||
Reference in New Issue
Block a user