From bf78651baa8caf35cc9be61644eb74d70763565a Mon Sep 17 00:00:00 2001 From: admin Date: Fri, 10 Jun 2016 10:12:15 +0000 Subject: [PATCH] =?UTF-8?q?=EC=9D=B8=EC=8A=A4=ED=83=80=EA=B7=B8=EB=9E=A8?= =?UTF-8?q?=20=ED=81=AC=EB=A1=A4=EB=9F=AC=20=EC=88=98=EC=A0=95=20sfilterpr?= =?UTF-8?q?ocess=20=EC=9E=A1=EB=8B=A4=EB=B2=84=EA=B7=B8=20=EB=94=94?= =?UTF-8?q?=EB=B2=84=EA=B9=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://192.168.0.12/source@277 8346c931-da38-4b9b-9d4c-e48b93cbd075 --- WebBasedCrawler/insta/instacrawl.py | 51 ++++++++++++++++---------- WebBasedCrawler/kakao/kakaocrawl.py | 6 ++- sfilterprocess/sconsumer.cpp | 5 ++- sfilterprocess/sdatagroup.cpp | 1 + sfilterprocess/sfilterprocess.pro.user | 4 +- sfilterprocess/sinfluencer.cpp | 5 ++- sfilterprocess/sinitializer.cpp | 13 ++++--- sfilterprocess/spowercafe.cpp | 5 ++- sfilterprocess/sspammer.cpp | 9 +++-- 9 files changed, 64 insertions(+), 35 deletions(-) diff --git a/WebBasedCrawler/insta/instacrawl.py b/WebBasedCrawler/insta/instacrawl.py index 43bf235..8c476b1 100644 --- a/WebBasedCrawler/insta/instacrawl.py +++ b/WebBasedCrawler/insta/instacrawl.py @@ -107,8 +107,8 @@ class InstaBodyCrawler: def find_article_data(self): ul = self.article.find_element_by_xpath("div/ul") try: - li = ul.find_element_by_css_selector("li[data-reactid$='.0']") - span = li.find_element_by_xpath("h1/span") + #li = ul.find_element_by_css_selector("li[data-reactid$='.0']") + span = ul.find_element_by_css_selector("li h1>span") return span.text except: return "" @@ -132,27 +132,32 @@ class InstaBodyCrawler: return None def find_like_num(self): - div = self.article.find_element_by_xpath("div/section/div[@data-reactid]") + div = self.article.find_element_by_xpath("div/section[1]/div") try: - span = div.find_element_by_css_selector("span[data-reactid$='.1'") + span = div.find_element_by_xpath("span/span") str_num = span.text + str_num = str_num.replace(',', '') if str_num[-1] == 'm': num = float(str_num[0:-1]) * 1000000 elif str_num[-1] == 'k': num = float(str_num[0:-1]) * 1000 else: - num = int(str_num) + num = int(str_num) return str(num) except: a_list = div.find_elements_by_tag_name("a") if len(a_list) > 1: return str(len(a_list)) else: - span = div.find_element_by_xpath("span[1]") - if len(span.text.strip()) < 1: + if a_list and a_list[0].get_attribute('title'): return str(1) else: return str(0) + # span = div.find_element_by_xpath("span[1]") + # if len(span.text.strip()) < 1: + # return str(1) + # else: + # return str(0) def find_reply_num(self): ul = self.article.find_element_by_xpath("div/ul") @@ -179,12 +184,13 @@ class InstaBodyCrawler: content["article_data"] = self.find_article_data() content["article_form"] = 'body' content["platform_form"] = 'post' + content["platform_title"] = content["article_id"] reply_num = self.find_reply_num() if int(reply_num) > 0: content["article_order"] = int(reply_num) like_num = self.find_like_num() if int(float(like_num)) > 0: - content["article_hit"] = int(float(like_num)) + content["reply_url"] = int(float(like_num)) return content def find_platform_title(self): @@ -193,6 +199,7 @@ class InstaBodyCrawler: def find_article_title(self): pass + class InstaReplyCrawler: def __init__(self, driver=None, article=None): self.driver = driver @@ -217,24 +224,28 @@ class InstaReplyCrawler: def has_more(self, ul): try: - li = ul.find_element_by_css_selector("li[data-reactid$='.1']") + button = ul.find_element_by_css_selector("li>button") return True except Exception as e: return False def read_more_reply(self, ul): try: - button = ul.find_element_by_css_selector("li[data-reactid$='.1']>button") + button = ul.find_element_by_css_selector("li>button") enter_element(button) except Exception as e: print_and_flush(e) def read_all_reply(self, ul): - for i in range(0, 10): - if self.has_more(ul): - self.read_more_reply(ul) - else: - break + i = 0 + while i < 200 and self.has_more(ul): + self.read_more_reply(ul) + i += 1 + # for i in range(0, 10): + # if self.has_more(ul): + # self.read_more_reply(ul) + # else: + # break def get_reply_ul(self): ul = self.article.find_element_by_xpath("div/ul") @@ -242,7 +253,7 @@ class InstaReplyCrawler: def has_reply(self, ul): try: - lis = ul.find_elements_by_css_selector("li[data-reactid*='comment']") + lis = ul.find_elements_by_css_selector("li>a") if len(lis) > 0: return True except: @@ -294,7 +305,7 @@ class InstaReplyCrawler: def find_article_data(self, ul): data_list = list() - span_list = ul.find_elements_by_css_selector("li[data-reactid*='comment']>span") + span_list = ul.find_elements_by_css_selector("li>span") for i in span_list: data_list.append(i.text) return data_list @@ -378,7 +389,9 @@ class InstaPageCrawler: def has_first_page(self): try: - a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60) + #a = find_element_by_css_selector(self.driver, "html>body>span>section>main>article>div[data-reactid*='mostRecentSection']>div>div[data-reactid$='.$0']>a", 60) + #a = find_element_by_css_selector(self.driver, "a[class='_8mlbc _vbtk2 _t5r8b']") + a = find_element_by_css_selector(self.driver, "div[class='_8fxp6'] a[class='_8mlbc _vbtk2 _t5r8b']") enter_element(a) return True except: @@ -538,4 +551,4 @@ class InstaMainCrawler: real_time = self.crawl_init.is_realtime() print_and_flush("Finished Crawling :)") self.send_to_db.close() - self.driver.quit() + #self.driver.quit() diff --git a/WebBasedCrawler/kakao/kakaocrawl.py b/WebBasedCrawler/kakao/kakaocrawl.py index 9f79aa2..f156549 100644 --- a/WebBasedCrawler/kakao/kakaocrawl.py +++ b/WebBasedCrawler/kakao/kakaocrawl.py @@ -115,13 +115,15 @@ class KakaoBodyCrawler: return "body" def find_article_data(self): - more = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/p[@class='more _moreBtnContainer']") + more = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']" + "/p[@class='more _moreBtnContainer']") display = more.get_attribute("style") if display.find('none') == -1: a = more.find_element_by_tag_name("a") self.enter_element(a) try: - content = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']/div[@class='txt_wrap _content']") + content = self.activity.find_element_by_xpath("div/div[@class='fd_cont _contentWrapper']" + "/div[@class='txt_wrap']/div[@class='_content']") except: return str("") return content.text diff --git a/sfilterprocess/sconsumer.cpp b/sfilterprocess/sconsumer.cpp index baa0f47..de4da9b 100644 --- a/sfilterprocess/sconsumer.cpp +++ b/sfilterprocess/sconsumer.cpp @@ -139,7 +139,10 @@ bool SConsumer::makeOverallCategory(int _nCategory) for (QMap::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end(); iterPos1++) { - streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value()); + if (streal.m_mapRealReplyUniqueUrl.contains(iterPos1.key()) && streal.m_mapRealReplyUniqueUrl.values(iterPos1.key()).contains(iterPos1.value())) + continue; + else + streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value()); } } diff --git a/sfilterprocess/sdatagroup.cpp b/sfilterprocess/sdatagroup.cpp index 09faaef..217e835 100644 --- a/sfilterprocess/sdatagroup.cpp +++ b/sfilterprocess/sdatagroup.cpp @@ -346,6 +346,7 @@ bool SDatagroup::makeTable() stNickname stnickname; stnickname.nickname = strListReply[anColumn[SInitializer::E_DATA_article_nickname]]; stnickname.num = nNickname++; + stnickname.platformname_num = m_pSInitializer->getPlatformName(strListReply[anColumn[SInitializer::E_DATA_platform_name]].trimmed()); m_SNickname.put(strListReply[anColumn[SInitializer::E_DATA_article_nickname]].trimmed(), stnickname); } } diff --git a/sfilterprocess/sfilterprocess.pro.user b/sfilterprocess/sfilterprocess.pro.user index 377f68c..50ade45 100644 --- a/sfilterprocess/sfilterprocess.pro.user +++ b/sfilterprocess/sfilterprocess.pro.user @@ -1,6 +1,6 @@ - + EnvironmentId @@ -787,7 +787,7 @@ sfilterprocess Qt4ProjectManager.Qt4RunConfiguration:C:/source/sfilterprocess/sfilterprocess.pro - "470" "test2y" + "469" "test2y" sfilterprocess.pro false true diff --git a/sfilterprocess/sinfluencer.cpp b/sfilterprocess/sinfluencer.cpp index 820b0cb..706247d 100644 --- a/sfilterprocess/sinfluencer.cpp +++ b/sfilterprocess/sinfluencer.cpp @@ -156,7 +156,10 @@ bool SInfluencer::makeOverallCategory(int _categoryNum) for (QMap::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end(); iterPos1++) { - streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value()); + if (streal.m_mapRealReplyUniqueUrl.contains(iterPos1.key()) && streal.m_mapRealReplyUniqueUrl.values(iterPos1.key()).contains(iterPos1.value())) + continue; + else + streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value()); } } diff --git a/sfilterprocess/sinitializer.cpp b/sfilterprocess/sinitializer.cpp index 2c989c0..9cf0e74 100644 --- a/sfilterprocess/sinitializer.cpp +++ b/sfilterprocess/sinitializer.cpp @@ -110,7 +110,8 @@ void SInitializer::updateWebDBInfoComplete(QSqlDatabase _db, QString _mindate, Q "replycount = " + QString::number(_replycount)+" , " "mindate = '" + _mindate + "'," "maxdate = '" + _maxdate + "'," - "lastupdate = '" + _lastupdate + "' " + //"lastupdate = '" + _lastupdate + "' " + "lastupdate = '" + QDateTime::currentDateTime().toString("yyyy-MM-dd hh:mm:ss") + "' " "where company_num = " + QString::number(_nCompany); _db.exec(strQuery.toUtf8()); } @@ -154,12 +155,12 @@ bool SInitializer::initSpammer() settings.beginGroup(QString("spammers")); if (settings.childKeys().size() == 0) return false; - m_adSpammerParam[E_SPAMER_BODY_COUNT_CUT] = settings.value("body_cut").toDouble(); - m_adSpammerParam[E_SPAMER_BODY_COUNT_RATIO] = settings.value("body_ratio").toDouble(); - m_adSpammerParam[E_SPAMER_REPLY_COUNT_CUT] = settings.value("reply_cut").toDouble(); + m_adSpammerParam[E_SPAMER_BODY_COUNT_CUT] = settings.value("body_cut").toDouble(); + m_adSpammerParam[E_SPAMER_BODY_COUNT_RATIO] = settings.value("body_ratio").toDouble(); + m_adSpammerParam[E_SPAMER_REPLY_COUNT_CUT] = settings.value("reply_cut").toDouble(); m_adSpammerParam[E_SPAMER_REPLY_COUNT_RATIO] = settings.value("reply_ratio").toDouble(); - m_adSpammerParam[E_SPAMER_NICK_COUNT_CUT] = settings.value("nick_cut").toDouble(); - m_adSpammerParam[E_SPAMER_NICK_COUNT_RATIO] = settings.value("nick_ratio").toDouble(); + m_adSpammerParam[E_SPAMER_NICK_COUNT_CUT] = settings.value("nick_cut").toDouble(); + m_adSpammerParam[E_SPAMER_NICK_COUNT_RATIO] = settings.value("nick_ratio").toDouble(); settings.endGroup(); return true; diff --git a/sfilterprocess/spowercafe.cpp b/sfilterprocess/spowercafe.cpp index b8a2a52..2849577 100644 --- a/sfilterprocess/spowercafe.cpp +++ b/sfilterprocess/spowercafe.cpp @@ -156,7 +156,10 @@ bool SPowercafe::makeOverallCategory(int _categoryNum) for (QMap::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end(); iterPos1++) { - streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value()); + if (streal.m_mapRealReplyUniqueUrl.contains(iterPos1.key()) && streal.m_mapRealReplyUniqueUrl.values(iterPos1.key()).contains(iterPos1.value())) + continue; + else + streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value()); } } diff --git a/sfilterprocess/sspammer.cpp b/sfilterprocess/sspammer.cpp index ab9bc81..6b8c7cc 100644 --- a/sfilterprocess/sspammer.cpp +++ b/sfilterprocess/sspammer.cpp @@ -150,7 +150,10 @@ bool SSpammer::makeOverallCategory(int _categoryNum) for (QMap::iterator iterPos1 = val.m_mapRealReplyUniqueUrl.begin(); iterPos1 != val.m_mapRealReplyUniqueUrl.end(); iterPos1++) { - streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value()); + if (streal.m_mapRealReplyUniqueUrl.contains(iterPos1.key()) && streal.m_mapRealReplyUniqueUrl.values(iterPos1.key()).contains(iterPos1.value())) + continue; + else + streal.m_mapRealReplyUniqueUrl.insertMulti(iterPos1.key(), iterPos1.value()); } } @@ -222,7 +225,7 @@ void SSpammer::STSpammer::makeTable(SDatagroup &_datagroup, const QMapbody; stspammer.replycount = iterPos1->reply; - stspammer.nicknamecount = iterPos1->nickname.size(); + stspammer.nicknamecount = iterPos1->nickname.size() - 1; stspammer.id_num = iterPos1->id_num; stspammer.id_id = iterPos1.key().split(",").at(0); stspammer.category_num = iterPos.key(); @@ -419,7 +422,7 @@ void SSpammer::SStatsSpaSpammerRank::makeTable(SDatagroup &_datagroup, const QMa ststats.bodycount = iterPos1->body; ststats.replycount = iterPos1->reply; ststats.id_num = iterPos1->id_num; - ststats.nicknamecount = iterPos1->nickname.size(); + ststats.nicknamecount = iterPos1->nickname.size() - 1; ststats.id_id = iterPos1.key().split(",").at(0); ststats.platformname_name = iterPos1.key().split(",").at(1); ststats.subject = "spammervalue";