From c55fc7f2f02d8e9f8b0848978337c0628e00c9a7 Mon Sep 17 00:00:00 2001 From: admin Date: Fri, 30 Sep 2016 07:04:26 +0000 Subject: [PATCH] =?UTF-8?q?=EC=82=AD=EC=A0=9C=EB=90=9C=20=EA=B2=8C?= =?UTF-8?q?=EC=8B=9C=EB=AC=BC=EC=97=90=20=EC=A0=91=EA=B7=BC=EC=8B=9C=20ale?= =?UTF-8?q?rt=20=EB=9C=A8=EB=8A=94=20=EB=AC=B8=EC=A0=9C=20=ED=95=B4?= =?UTF-8?q?=EA=B2=B0=20=EB=B0=8F=20=EC=98=A4=EB=A5=98=EC=B2=98=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://192.168.0.12/source@296 8346c931-da38-4b9b-9d4c-e48b93cbd075 --- CrawlerProcess/scrawler.cpp | 52 +++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/CrawlerProcess/scrawler.cpp b/CrawlerProcess/scrawler.cpp index 90c01d0..7e446ad 100644 --- a/CrawlerProcess/scrawler.cpp +++ b/CrawlerProcess/scrawler.cpp @@ -9,6 +9,18 @@ #include #include +class SWebPage:public QWebPage /* QWebPage subclass installed by SCrawler's constructor; overrides the JavaScript alert hook */ +{ +public: + SWebPage(QObject * parent = 0): QWebPage(parent){} +protected: + void javaScriptAlert(QWebFrame * frame, const QString & msg){ /* Qt's hook for a page-script alert() call; the commit subject ties such alerts to deleted posts. frame and msg are not used. */ + std::cout << "deletedurl"; /* fixed marker on stdout, no newline; presumably consumed by whoever launched the crawler - TODO confirm */ + exit(1); /* ends the whole process with status 1 from inside the page callback */ + } + //bool javaScriptConfirm(QWebFrame * frame, const QString & msg){} +}; + using namespace std; const int RETRY_MAX = 4; @@ -22,7 +34,7 @@ struct SProxyList SCrawler::SCrawler():QObject() { - m_page = new QWebPage; + m_page = new SWebPage; /* SWebPage instead of a plain QWebPage so alert() reaches the override */ m_nRetryCount = 0; m_bProcessed = false; connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool))); @@ -587,7 +599,7 @@ enum E_DATA bool SCrawler::saveFrameUrl(QWebFrame *frame) { //static int cz = 0; - // Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml()); + //Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml()); //QSqlQuery sql; if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0) @@ -601,6 +613,7 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame) { QString str[E_DATA_MAX]; QString sympathy; + QString numofReply; /* digit string taken from the _cmtList link; the only visible consumer in this patch is commented out */ QString strProfile; QWebElement 
proTitle = Find(frame->documentElement(),"meta","property","og:article:author"); str[E_DATA_PLATFORM_TITLE] = proTitle.attribute("content").split("|").at(1).trimmed(); @@ -661,6 +674,15 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame) } { QWebElement post = Find(frame->documentElement(),"div","id","postListBody"); + { + QWebElement weCmt = post.findFirst("a[class^='pcol2 _cmtList']"); /* first anchor under postListBody whose class attribute starts with pcol2 _cmtList */ + if (!weCmt.isNull()) + { + numofReply = weCmt.toPlainText().replace(",", "").trimmed(); /* link text with commas and surrounding whitespace removed */ + numofReply = numofReply.replace(QRegExp("[\\D]"), ""); /* drop every remaining non-digit character */ + } + } + QWebElement post_top = Find(post,"table","class","post-top"); { QWebElement title = Find(post_top,"span","class","pcol1 itemSubjectBoldfont"); @@ -679,6 +701,7 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame) } } + { QWebElement date = Find(post_top,"p","class","date fil5 pcol2 _postAddDate"); if(date.isNull()) @@ -762,8 +785,13 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame) bodydata.setData(m_strUrl, bodydata.ARTICLE_URL); bodydata.setTable(m_strTable); - bodydata.setData(sympathy, bodydata.ARTICLE_HIT); + + bodydata.setData(sympathy, bodydata.ARTICLE_HIT); //original data + /* + bodydata.setData(numofReply, bodydata.ARTICLE_ORDER); + bodydata.setData(sympathy, bodydata.REPLY_URL); + */ bodydata.setData("naver", bodydata.PLATFORM_NAME); bodydata.setData("blog", bodydata.PLATFORM_FORM); bodydata.setData("body", bodydata.ARTICLE_FORM); @@ -1047,7 +1075,7 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame) if (frame->frameName() == "cafe_main") { { - QString strData,strDate,strNick,strID,strHits,strTitle; + QString strData,strDate,strNick,strID,strHits,strTitle,strReply,strLike; { QWebElement group = Find(frame->documentElement(),"div","class","tbody m-tcol-c"); strData = SqlString(group.toPlainText().trimmed()); @@ -1096,6 +1124,15 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame) } strHits = Find(frame->documentElement(),"span","class","b m-tcol-c reply _rosReadcount").toPlainText(); + { + strReply = 
frame->documentElement().findFirst("td.reply").toPlainText().replace(",", "").trimmed(); /* text of the first td.reply element, commas and surrounding whitespace removed */ + strReply = strReply.replace(QRegExp("[\\D]"), ""); /* keep digits only */ + } + { + strLike = frame->documentElement().findFirst("a#upArticleLink").toPlainText().replace(",", "").trimmed(); /* text of a#upArticleLink with commas removed; unlike strReply, non-digit text is left in */ + } + + if (strHits.isEmpty()) { strHits = Find(frame->documentElement(),"span","class","kin_count m-tcol-c _rosReadcount").toPlainText(); @@ -1114,7 +1151,9 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame) bodydata.setData("body", bodydata.ARTICLE_FORM); bodydata.setData(m_strUrl.split("/").at(3), bodydata.PLATFORM_ID); bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID); + bodydata.setData(strReply, bodydata.ARTICLE_ORDER); /* NOTE(review): strReply is stored under the ARTICLE_ORDER field - confirm this reuse is intended */ bodydata.setData(strTitle, bodydata.ARTICLE_TITLE); + bodydata.setData(strLike, bodydata.REPLY_URL); /* NOTE(review): strLike is stored under the REPLY_URL field - confirm this reuse is intended */ /* QSqlQuery sql; QString strQuery = "update "; @@ -2112,7 +2151,7 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) QString strID, strNick, strData, strDate; strData = Find(sub_li, "span", "class", "u_cbox_contents").toPlainText(); strNick = strID = Find(sub_li, "span", "class", "u_cbox_name").toPlainText(); - strDate = Find(comment_box, "span", "class", "u_cbox_date").toPlainText(); + strDate = Find(sub_li, "span", "class", "u_cbox_date").toPlainText(); if(strDate.contains(":")) strDate += ":00"; else @@ -2401,6 +2440,9 @@ void SCrawler::deleteProxy() QSqlQuery sqlquery; QString strquery = "delete from Proxy where proxy = '" + m_strProxyIP + "' and port = " + QString::number(m_nProxyPort); if(sqlquery.exec(strquery.toUtf8()) == false) + { cout << "Error : " << strquery.toStdString() << endl; + cout << sqlquery.lastError().text().toStdString() << endl; + } }