From 55512fe694e101310ab60f94a2842573da4e4730 Mon Sep 17 00:00:00 2001 From: admin Date: Fri, 22 Jul 2016 08:51:34 +0000 Subject: [PATCH] =?UTF-8?q?new=20crawler=20=EC=88=98=EC=A0=95=20filterproc?= =?UTF-8?q?ess=20spammer=20=EA=B7=B8=EB=9E=98=ED=94=84=EC=AA=BD=20?= =?UTF-8?q?=EB=B2=84=EA=B7=B8=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://192.168.0.12/source@283 8346c931-da38-4b9b-9d4c-e48b93cbd075 --- CrawlerProcess/scrawler.cpp | 199 +++++++++++++++++++++++-- sfilterprocess/sfilterprocess.pro.user | 4 +- sfilterprocess/sspammer.cpp | 5 +- 3 files changed, 187 insertions(+), 21 deletions(-) diff --git a/CrawlerProcess/scrawler.cpp b/CrawlerProcess/scrawler.cpp index acf71f5..90c01d0 100644 --- a/CrawlerProcess/scrawler.cpp +++ b/CrawlerProcess/scrawler.cpp @@ -184,7 +184,7 @@ void SCrawler::UpdateError(QString _strError) void SCrawler::saveResult(bool ok) { - qDebug() << "saveResult"; + //qDebug() << "saveResult"; if (!ok) { @@ -781,6 +781,7 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame) void SCrawler::reloadPage() { + //qDebug() << "reloadPage called"; saveResult(true); } @@ -1758,15 +1759,23 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) { if (m_bUse) return true; static bool bReplyDone = false; - static int reply_index = 0; + //static int reply_index = 0; + static int iLoaded = 0; + static bool bProcessed = false; - qDebug() << frame->baseUrl().toString(); - qDebug() << "executed"; + if (bProcessed) + return false; + bProcessed = true; + + + //qDebug() << frame->baseUrl().toString(); + //qDebug() << "executed"; if(frame->baseUrl().toString().contains("entertain") && !frame->baseUrl().toString().contains("comment")) { m_page->mainFrame()->load(QUrl(frame->baseUrl().toString().replace("read", "comment/list"))); + bProcessed = false; return false; } @@ -1774,16 +1783,75 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) if(frame->baseUrl().toString().contains("sports") && !frame->baseUrl().toString().contains("m_view=1")) { m_page->mainFrame()->load(QUrl(frame->baseUrl().toString() + "&m_view=1")); + bProcessed = false; return false; } - - - QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button"); - - if(!a.isNull()) + if(m_nRetryCount < RETRY_MAX && !bReplyDone) { + QWebElement u_cbox_paginate = Find(frame->documentElement(), "div", "class", "u_cbox_paginate"); + if (u_cbox_paginate.isNull()) + { + ++m_nRetryCount; + // qDebug() << m_nRetryCount; + QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage())); + bProcessed = false; + return false; + } + else + { + QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button"); + if(!a.isNull()) + { + a.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); + QTimer::singleShot(300, this, SLOT(reloadPage())); + // qDebug() << "load comments"; + //QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button"); + QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li"); + + // qDebug() << lis.count(); + if (lis.count() != iLoaded) + { + iLoaded = lis.count(); + bProcessed = false; + return false; + } + else + { + bReplyDone = true; + } + } + } + } + /* + QWebElement u_cbox_paginate = Find(frame->documentElement(), "div", "class", "u_cbox_paginate"); + //qDebug() << lis.count(); + + if (!u_cbox_paginate.isNull()) + { + QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button"); + //QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li"); + + if(!a.isNull()) + { + a.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); + QTimer::singleShot(300, this, SLOT(reloadPage())); + qDebug() << "load comments"; + + //QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button"); + QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li"); + + qDebug() << lis.count(); + if (lis.count() != iLoaded) + { + iLoaded = lis.count(); + bProcessed = false; + return false; + } + } + + //return false; while(!bReplyDone) { QWebElement current = Find(a, "em", "class", "u_cbox_page_on __cbox_page_current"); @@ -1806,11 +1874,12 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) break; } a.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); - QTimer::singleShot(100, this, SLOT(reloadPage())); + QTimer::singleShot(300, this, SLOT(reloadPage())); qDebug() << "load comments"; return false; } + QWebElementCollection reply_btns = frame->findAllElements("a[class='u_cbox_btn_reply']"); for(;reply_index < reply_btns.count() ; reply_index++) { @@ -1820,12 +1889,14 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) else { btn.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); - QTimer::singleShot(100, this, SLOT(reloadPage())); + QTimer::singleShot(200, this, SLOT(reloadPage())); //reply_index += 1; qDebug() << reply_index; return false; } } + */ + /* foreach(QWebElement a, reply_btns) { @@ -1841,8 +1912,10 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) } } - */ + } + */ + /* else { if(m_nRetryCount < RETRY_MAX) @@ -1850,15 +1923,106 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) m_nRetryCount++; qDebug() << m_nRetryCount; QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage())); + bProcessed = false; return false; } + */ + /* else { m_bUse = true; return true; } + */ + //} + /* + { + QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li"); + foreach (QWebElement li, lis) + { + QWebElement btn = li.findFirst("span[class='u_cbox_reply_cnt']"); + QWebElement atag = li.findFirst("a[class='u_cbox_btn_reply']"); + if (!btn.isNull() && !atag.isNull()) + { + atag.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); + QTimer::singleShot(300, this, SLOT(reloadPage())); + qDebug() << "click reply:" << btn.toPlainText(); + + bProcessed = false; + return false; + } + QWebElement div_load_more = li.findFirst("div[class='u_cbox_paginate']"); + if (!div_load_more.isNull()) + { + QWebElement load_more = div_load_more.findFirst("a[class='u_cbox_btn_more __cbox_page_button']"); + if (!load_more.isNull()) + { + load_more.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); + QTimer::singleShot(300, this, SLOT(reloadPage())); + qDebug() << "load more reply"; + bProcessed = false; + return false; + } + } + } + } + */ + + QWebElementCollection reply_btns = frame->findAllElements("a[class^='u_cbox_btn_reply']"); + + foreach (QWebElement ele, reply_btns) + { + QWebElement btn = ele.findFirst("span[class='u_cbox_reply_cnt']"); + + if ((ele.attribute("class") == "u_cbox_btn_reply") && !btn.isNull()) + { + ele.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); + QTimer::singleShot(250, this, SLOT(reloadPage())); + //qDebug() << "click reply:" << btn.toPlainText(); + + bProcessed = false; + return false; + } } + QWebElementCollection allPaginate = frame->documentElement().findAll("div[class='u_cbox_paginate']"); + foreach (QWebElement ele, allPaginate) + { + QWebElement load_more = ele.findFirst("a[class='u_cbox_btn_more __cbox_page_button']"); + if (!load_more.isNull()) + { + load_more.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); + QTimer::singleShot(250, this, SLOT(reloadPage())); + //qDebug() << "load more reply"; + bProcessed = false; + return false; + } + } + +/* + //for(;reply_index < reply_btns.count() ;) + for (int k = 0; k < reply_btns.count(); ++k) + { + //QWebElement btn = Find(reply_btns[reply_index], "span", "class", "u_cbox_reply_cnt"); + QWebElement btn = Find(reply_btns[k], "span", "class", "u_cbox_reply_cnt"); + //reply_index += 1; + + if(btn.isNull()) + continue; + else + { + //QWebElement btnA = Find(reply_btns[reply_index - 1], "a", "class", "u_cbox_btn_reply"); + reply_btns[k].evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); + qDebug() << "load??????????????"; + + qDebug() << reply_btns.count(); + QTimer::singleShot(300, this, SLOT(reloadPage())); + bProcessed = false; + + return false; + } + } +*/ { QWebElement logo = Find(frame->documentElement(),"div","class","press_logo"); QString strPlatID, strPlatTitle; @@ -1877,7 +2041,7 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) int order = 0; foreach(QWebElement li, lis) { - qDebug() << "li"; + //qDebug() << "li"; QWebElement comment_box = li.findFirst("div[class='u_cbox_comment_box']"); QString strParent; { @@ -1913,14 +2077,14 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) } else if(strDate.contains("분")) { - current_time = current_time.addDays(-(60 * strTime.toInt())); + current_time = current_time.addSecs(-(60 * strTime.toInt())); } else { ; } strDate = current_time.toString("yyyy-MM-dd hh:mm:ss"); - qDebug() << strDate; + // qDebug() << strDate; } { QSqlQuery query; @@ -1973,7 +2137,7 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) } else if(strDate.contains("분")) { - current_time = current_time.addDays(-(60 * strTime.toInt())); + current_time = current_time.addSecs(-(60 * strTime.toInt())); } else { @@ -2006,8 +2170,11 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) qDebug() << "lis count: " << lis.count(); } + //Debug("c:\\data\\replytest.html", frame->toHtml()); m_bUse = true; + bProcessed = false; + return true; } diff --git a/sfilterprocess/sfilterprocess.pro.user b/sfilterprocess/sfilterprocess.pro.user index 952da53..b0bd132 100644 --- a/sfilterprocess/sfilterprocess.pro.user +++ b/sfilterprocess/sfilterprocess.pro.user @@ -1,6 +1,6 @@ - + EnvironmentId @@ -787,7 +787,7 @@ sfilterprocess Qt4ProjectManager.Qt4RunConfiguration:C:/source/sfilterprocess/sfilterprocess.pro - "527" "testall2" + "530" "testall2" sfilterprocess.pro false true diff --git a/sfilterprocess/sspammer.cpp b/sfilterprocess/sspammer.cpp index 8417828..f4de8cd 100644 --- a/sfilterprocess/sspammer.cpp +++ b/sfilterprocess/sspammer.cpp @@ -1159,7 +1159,7 @@ void SSpammer::SStatsJson::makeTableRank(SDatagroup &_datagroup, const QMapid_num; + mapScore[strKey].id_num = _datagroup.getstReply(iterPos1.key(), iterPos1.value())->id_num; } mapScore[strKey].reply++; mapScore[strKey].nickname.insert((*mapReply)[iterPos1.key()][iterPos1.value()][anColumn[SInitializer::E_DATA_article_nickname]].trimmed()); @@ -1272,7 +1272,7 @@ void SSpammer::SStatsJson::makeTableRank(SDatagroup &_datagroup, const QMapid_num; + mapScore[strKey].id_num = _datagroup.getstReply(iterPos1.key(), iterPos1.value())->id_num; } mapScore[strKey].reply++; mapScore[strKey].nickname.insert((*mapReply)[iterPos1.key()][iterPos1.value()][anColumn[SInitializer::E_DATA_article_nickname]].trimmed()); @@ -1358,7 +1358,6 @@ void SSpammer::SStatsJson::makeTable(SDatagroup &_datagroup, const QMap