From 6b3521002a7a15bfba65ca887a39075dd94e2925 Mon Sep 17 00:00:00 2001 From: admin Date: Thu, 24 Dec 2015 01:39:40 +0000 Subject: [PATCH] =?UTF-8?q?=EB=84=A4=EC=9D=B4=EB=B2=84=20=EB=89=B4?= =?UTF-8?q?=EC=8A=A4=20=EC=88=98=EC=A0=95=20=EB=84=A4=EC=9D=B4=EB=B2=84=20?= =?UTF-8?q?=EB=B8=94=EB=A1=9C=EA=B7=B8=20=EB=B3=B8=EB=AC=B8=20=EC=88=98?= =?UTF-8?q?=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://192.168.0.12/source@231 8346c931-da38-4b9b-9d4c-e48b93cbd075 --- CrawlerProcess/scrawler.cpp | 370 +++++++++++++++++++++++++++++++- CrawlerProcess/scrawler.h | 13 +- CrawlerProcess/scrawlerdata.cpp | 7 +- CrawlerProcess/scrawlerdata.h | 6 +- 4 files changed, 378 insertions(+), 18 deletions(-) diff --git a/CrawlerProcess/scrawler.cpp b/CrawlerProcess/scrawler.cpp index 79eb32f..70772d2 100644 --- a/CrawlerProcess/scrawler.cpp +++ b/CrawlerProcess/scrawler.cpp @@ -5,11 +5,12 @@ #include #include #include +#include using namespace std; -const int BLOG_RETRY_MAX = 4; -const int BLOG_RETRY_INTERVAL = 3000; +const int RETRY_MAX = 4; +const int RETRY_INTERVAL = 3000; struct SProxyList { @@ -20,7 +21,7 @@ struct SProxyList SCrawler::SCrawler():QObject() { m_page = new QWebPage; - m_nBlogRetryCount = 0; + m_nRetryCount = 0; connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool))); } @@ -79,6 +80,20 @@ void SCrawler::load(QStringList _strlistArgv) m_nSelect = E_NAVER_BLOG_REPLY; } + if (_strlistArgv[1] == "news_data") + { + m_strUrl = _strlistArgv[2]; + m_nSelect = E_NAVER_NEWS_DATA; + m_strKeywordID = _strlistArgv[4]; + } + + if (_strlistArgv[1] == "news_comm") + { + m_strUrl = _strlistArgv[2]; + m_nSelect = E_NAVER_NEWS_REPLY; + } + + if (_strlistArgv.size() > 3) m_strTable = "data_" + _strlistArgv[3]; } @@ -173,10 +188,27 @@ void SCrawler::saveResult(bool ok) emit finished(); return; } - + qDebug() << "load complete"; switch(m_nSelect) { case 
E_NAVER_NEWS_LIST:saveFrameNewsList(m_page->mainFrame());break; + case E_NAVER_NEWS_DATA: + { + static bool loaded = false; + if(!loaded) + { + saveFrameNewsUrl(m_page->mainFrame()); + bodydata.sendDB(); + } + loaded = true; + break; + } + case E_NAVER_NEWS_REPLY: + { + if(!saveFrameNewsComment(m_page->mainFrame())) + return; + break; + } case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break; case E_NAVER_CAFE_DATA: { @@ -236,6 +268,7 @@ void SCrawler::saveResult(bool ok) break; case E_NAVER_BLOG_REPLY: + case E_NAVER_NEWS_REPLY: case E_DAUM_BLOG_REPLY: if (m_bUse) { @@ -247,6 +280,7 @@ void SCrawler::saveResult(bool ok) case E_NAVER_BLOG_BODY: case E_DAUM_CAFE_DATA: case E_DAUM_BLOG_BODY: + case E_NAVER_NEWS_DATA: if (m_bUse == false) { cout << "fail"; @@ -585,7 +619,8 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame) } { - QWebElement body = Find(post,"div","class","post-view pcol2 _param(1)"); + //QWebElement body = Find(post,"div","class","post-view pcol2 _param(1)"); + QWebElement body = post.findFirst("div[class^='post-view pcol2 _param(1)']"); if (body.toPlainText().isEmpty()==false) { str[E_DATA_DATA] = body.toPlainText(); @@ -612,11 +647,11 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame) //retry if profile is empty and sympathy is empty - if((strProfile.isEmpty() || sympathy.isEmpty()) && (m_nBlogRetryCount < BLOG_RETRY_MAX)) + if((strProfile.isEmpty() || sympathy.isEmpty()) && (m_nRetryCount < RETRY_MAX)) { - m_nBlogRetryCount++; - qDebug() << m_nBlogRetryCount; - QTimer::singleShot(BLOG_RETRY_INTERVAL, this, SLOT(crawlBlog())); + m_nRetryCount++; + qDebug() << m_nRetryCount; + QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage())); return false; } } @@ -661,7 +696,7 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame) return b_ok; } -void SCrawler::crawlBlog() +void SCrawler::reloadPage() { saveResult(true); } @@ -1076,7 +1111,6 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame) { if (eleSubUrl.attribute("class") == "f_url") strUrl 
= eleSubUrl.attribute("href"); - } if (strUrl.split("/").at(2) == "cafe.daum.net") @@ -1480,6 +1514,320 @@ void SCrawler::saveFrameNewsList(QWebFrame *frame) m_bUse = true; } + +void SCrawler::saveFrameNewsUrl(QWebFrame *frame) +{ + if (m_bUse) return; + + { + QString strQuery = "delete from "; + strQuery += m_strTable + " where article_url = '"; + strQuery += m_strUrl + "'"; + QSqlQuery query; + if(query.exec(strQuery.toUtf8()) == false) + { + cout << query.lastError().text().toStdString(); + cout << query.lastQuery().toStdString(); + } + } + + + + QString strTitle,strDate,strData,strPlatID,strPlatTitle,strlike; + { + QWebElement element = Find(frame->documentElement(),"div","class","article_info"); + { + strTitle = Find(element,"h3","id","articleTitle").toPlainText(); // Title; + strDate = Find(element,"span","class","t11").toPlainText(); // Date + } + strData = Find(frame->documentElement(),"div","id","articleBodyContents").toPlainText(); + strlike = Find(frame->documentElement(),"div","class","u_likeit_module").toPlainText(); + //e​ntertainment + if (strTitle.isEmpty()) + { + QWebElement elementTitle = Find(frame->documentElement(),"div","class","end_ct_area"); + strTitle = Find(elementTitle,"p","class","end_tit").toPlainText(); + } + //e​ntertainment + if (strDate.isEmpty()) strDate = Find(element,"em").toPlainText(); + if (strData.isEmpty()) strData = Find(frame->documentElement(),"div","id","articeBody").toPlainText(); + + if (strlike.isEmpty() == false) strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")"; + else + { + //Debug("out.html",m_page->mainFrame()->toHtml()); + } + + element = Find(frame->documentElement(),"div","class","press_logo"); + { + strPlatID = Find(element,"a").attribute("href"); + strPlatTitle = Find(element,"img").attribute("alt"); + QStringList strlistPlat = strPlatID.split("."); + if(strlistPlat.size() > 2) + { + if (strlistPlat.at(0) == QString("http://www")) + strPlatID = strlistPlat.at(1); + } + } + } + 
bodydata.setTable(m_strTable); + bodydata.setData(bodydata.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE); + bodydata.setData(bodydata.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA); + bodydata.setData(strPlatID,SCrawlerData::PLATFORM_ID); + bodydata.setData(strPlatTitle,SCrawlerData::PLATFORM_TITLE); + bodydata.setData(strDate, SCrawlerData::ARTICLE_DATE); + bodydata.setData("naver", SCrawlerData::PLATFORM_NAME); + bodydata.setData("news", SCrawlerData::PLATFORM_FORM); + bodydata.setData("body", SCrawlerData::ARTICLE_FORM); + bodydata.setData(m_strUrl, SCrawlerData::ARTICLE_URL); + bodydata.setData(m_strKeywordID, SCrawlerData::KEYWORD_ID); + + m_bUse = true; + +} + +bool SCrawler::saveFrameNewsComment(QWebFrame *frame) +{ + if (m_bUse) return true; + static bool bReplyDone = false; + static bool bReplyReplyDone = false; + static int reply_index = 0; + + qDebug() << "executed"; + + QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button"); + + if(!a.isNull()) + { + + while(!bReplyDone) + { + QWebElement current = Find(a, "em", "class", "u_cbox_page_on __cbox_page_current"); + QWebElement total = Find(a, "em", "class", "u_cbox_page_total __cbox_page_total"); + QString str_current = current.toPlainText(); + QString str_total = total.toPlainText(); + bool ok; + + int n_current = str_current.replace(",", "").toInt(&ok); + if(!ok) + break; + + int n_total = str_total.replace(",", "").toInt(&ok); + if(!ok) + break; + + if(n_current >= n_total) + { + bReplyDone = true; + break; + } + a.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); + QTimer::singleShot(100, this, SLOT(reloadPage())); + qDebug() << "load comments"; + return false; + } + + QWebElementCollection reply_btns = frame->findAllElements("a[class='u_cbox_btn_reply']"); + for(;reply_index < reply_btns.count() ; reply_index++) + { + QWebElement btn = Find(reply_btns[reply_index], "span", 
"class", "u_cbox_reply_cnt"); + if(btn.isNull()) + continue; + else + { + btn.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); + QTimer::singleShot(100, this, SLOT(reloadPage())); + //reply_index += 1; + qDebug() << reply_index; + return false; + } + } + /* + foreach(QWebElement a, reply_btns) + { + QWebElement btn = Find(a, "span", "class", "u_cbox_reply_cnt"); + if(btn.isNull()) + continue; + else + { + btn.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);"); + //QTimer::singleShot(100, this, SLOT(reloadPage())); + qDebug() << "qq"; + //return false; + + } + } + */ + } + else + { + if(m_nRetryCount < RETRY_MAX) + { + m_nRetryCount++; + qDebug() << m_nRetryCount; + QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage())); + return false; + } + else + { + m_bUse = true; + return true; + } + } + + { + QWebElement logo = Find(frame->documentElement(),"div","class","press_logo"); + QString strPlatID, strPlatTitle; + { + strPlatID = Find(logo,"a").attribute("href"); + strPlatTitle = Find(logo,"img").attribute("alt"); + } + QStringList strlistPlat = strPlatID.split("."); + if(strlistPlat.size() > 2) + { + if (strlistPlat.at(0) == QString("http://www")) + strPlatID = strlistPlat.at(1); + } + //QWebElement ul = frame->findFirstElement("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']"); + QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li"); + int order = 0; + foreach(QWebElement li, lis) + { + qDebug() << "li"; + QWebElement comment_box = li.findFirst("div[class='u_cbox_comment_box']"); + QString strParent; + { + QString strID, strNick, strData, strLike, strDislike, strDate; + strData = Find(comment_box, "span", "class", "u_cbox_contents").toPlainText(); + strNick = strParent = strID = Find(comment_box, "span", "class", 
"u_cbox_name").toPlainText(); + strLike = Find(comment_box, "em", "class", "u_cbox_cnt_recomm").toPlainText().replace(",", ""); + strDislike = Find(comment_box, "em", "class", "u_cbox_cnt_unrecomm").toPlainText().replace(",", ""); + strData += "\n(goodCount:" + strLike +")\n(badCount:" + strDislike + ")"; + + strDate = Find(comment_box, "span", "class", "u_cbox_date").toPlainText(); + if(strDate.contains(":")) + strDate += ":00"; + else /* no ':' in the text: handled as a relative age counted back from now; the last number found is the amount */ + { + QDateTime current_time = QDateTime::currentDateTime(); + QRegExp rx("(\\d+)"); + int pos = 0; + QString strTime; + while ((pos = rx.indexIn(strDate, pos)) != -1) + { + strTime = rx.cap(1); + pos += rx.matchedLength(); + } + + if(strDate.contains("시간")) /* hours */ + { + current_time = current_time.addSecs(-(60 * 60 * strTime.toInt())); + } + else if(strDate.contains("일")) /* days */ + { + current_time = current_time.addDays(-(strTime.toInt())); + } + else if(strDate.contains("분")) /* minutes */ + { + current_time = current_time.addSecs(-(60 * strTime.toInt())); /* fix: step back N*60 seconds; addDays() here moved the stamp back 60*N days */ + } + else + { + ; + } + strDate = current_time.toString("yyyy-MM-dd hh:mm:ss"); + qDebug() << strDate; + } + { + QSqlQuery query; + query.prepare(QString("insert into " + m_strTable + + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_order,platform_id,platform_title, article_date) " + "VALUES ('naver','news','reply',:URL,:ID,:NICK,:DATA,:ROWNUM,:PLATFORMID,:TITLE,:DATE)").toUtf8()); + + query.bindValue(":URL",m_strUrl.replace("&m_view=1","").toUtf8()); + query.bindValue(":ID",strID.toUtf8()); + query.bindValue(":NICK",strNick.toUtf8()); + query.bindValue(":DATA",strData.toUtf8()); + query.bindValue(":ROWNUM",order++); + query.bindValue(":PLATFORMID",strPlatID.toUtf8()); + query.bindValue(":TITLE",strPlatTitle.toUtf8()); + query.bindValue(":DATE", strDate.toUtf8()); + if (query.exec()==false) + cout << "error : " << query.lastError().text().toStdString(); + } + } + QWebElement reply_area = li.findFirst("div[class='u_cbox_reply_area']"); + 
QWebElementCollection sub_lis = reply_area.findAll("ul[class='u_cbox_list']>li"); + + foreach(QWebElement sub_li, sub_lis) /* one row per nested reply, linked to its parent through :PARENT */ + { + QString strID, strNick, strData, strDate; + strData = Find(sub_li, "span", "class", "u_cbox_contents").toPlainText(); + strNick = strID = Find(sub_li, "span", "class", "u_cbox_name").toPlainText(); + strDate = Find(sub_li, "span", "class", "u_cbox_date").toPlainText(); /* fix: read the reply's own date; comment_box is the parent comment */ + if(strDate.contains(":")) + strDate += ":00"; + else /* no ':' in the text: handled as a relative age counted back from now; the last number found is the amount */ + { + QDateTime current_time = QDateTime::currentDateTime(); + QRegExp rx("(\\d+)"); + int pos = 0; + QString strTime; + while ((pos = rx.indexIn(strDate, pos)) != -1) + { + strTime = rx.cap(1); + pos += rx.matchedLength(); + } + + if(strDate.contains("시간")) /* hours */ + { + current_time = current_time.addSecs(-(60 * 60 * strTime.toInt())); + } + else if(strDate.contains("일")) /* days */ + { + current_time = current_time.addDays(-(strTime.toInt())); + } + else if(strDate.contains("분")) /* minutes */ + { + current_time = current_time.addSecs(-(60 * strTime.toInt())); /* fix: step back N*60 seconds; addDays() here moved the stamp back 60*N days */ + } + else + { + ; + } + strDate = current_time.toString("yyyy-MM-dd hh:mm:ss"); + } + + { + QSqlQuery query; + query.prepare(QString("insert into " + m_strTable + + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_order,platform_id,platform_title, article_date, article_parent) " + "VALUES ('naver','news','reply',:URL,:ID,:NICK,:DATA,:ROWNUM,:PLATFORMID,:TITLE,:DATE,:PARENT)").toUtf8()); + + query.bindValue(":URL",m_strUrl.replace("&m_view=1","").toUtf8()); + query.bindValue(":ID",strID.toUtf8()); + query.bindValue(":NICK",strNick.toUtf8()); + query.bindValue(":DATA",strData.toUtf8()); + query.bindValue(":ROWNUM",order++); + query.bindValue(":PLATFORMID",strPlatID.toUtf8()); + query.bindValue(":TITLE",strPlatTitle.toUtf8()); + query.bindValue(":DATE", strDate.toUtf8()); + query.bindValue(":PARENT", strParent.toUtf8()); + if (query.exec()==false) + cout << "error : " << query.lastError().text().toStdString(); + + } + } + } + qDebug() 
<< "lis count: " << lis.count(); + } + + //Debug("c:\\data\\replytest.html", frame->toHtml()); + m_bUse = true; + return true; +} + + QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind) { QWebElementCollection elements = _FindElement.findAll(_strElement); diff --git a/CrawlerProcess/scrawler.h b/CrawlerProcess/scrawler.h index 1f3c071..984caea 100644 --- a/CrawlerProcess/scrawler.h +++ b/CrawlerProcess/scrawler.h @@ -19,7 +19,9 @@ public: E_DAUM_BLOG_LIST, E_DAUM_BLOG_BODY, E_DAUM_BLOG_REPLY, - E_NAVER_NEWS_LIST + E_NAVER_NEWS_LIST, + E_NAVER_NEWS_DATA, + E_NAVER_NEWS_REPLY, }; public: SCrawler(); @@ -31,7 +33,7 @@ signals: void finished(); private slots: void saveResult(bool ok); - void crawlBlog(); + void reloadPage(); private: int m_nSelect; QString m_strReper; @@ -50,7 +52,7 @@ private: bool m_bNothing; QString m_strProxyIP; int m_nProxyPort; - int m_nBlogRetryCount; + int m_nRetryCount; QString SqlString(QString _str); QString GetSafeUtf(QString _strData); void saveFrameList(QWebFrame *frame); @@ -64,13 +66,15 @@ private: void saveFrameDaumBlogComment(QWebFrame *frame); void saveFrameDaumCafeUrl(QWebFrame *frame); void saveFrameNewsList(QWebFrame *frame); + void saveFrameNewsUrl(QWebFrame *frame); + bool saveFrameNewsComment(QWebFrame *frame); int GetNumber(QString _str); bool getProxyList(QString &_str); void setProxy(); void deleteProxy(); - QWebElement Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind); + QWebElement Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib="",const QString _strFind=""); QWebElement FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength); QWebElement FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString 
_strFind, const int _strStart); QWebElement FindRight(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind); @@ -80,4 +84,5 @@ private: void UpdateError(QString _strError); }; + #endif // SCRAWLER_H diff --git a/CrawlerProcess/scrawlerdata.cpp b/CrawlerProcess/scrawlerdata.cpp index 764d96a..1e47e60 100644 --- a/CrawlerProcess/scrawlerdata.cpp +++ b/CrawlerProcess/scrawlerdata.cpp @@ -4,6 +4,7 @@ #include #include #include +#include using namespace std; SCrawlerData::SCrawlerData() { @@ -26,6 +27,7 @@ SCrawlerData::SCrawlerData() m_strColumn[PLATFORM_NAME] = "platform_name"; m_strColumn[PLATFORM_TITLE] = "platform_title"; m_strColumn[REPLY_URL] = "reply_url"; + //m_strColumn[ETC] = "etc"; } SCrawlerData::~SCrawlerData() @@ -94,12 +96,15 @@ bool SCrawlerData::sendDB() { if(i == ARTICLE_ORDER) query.bindValue(QString(":" + m_strColumn[i]), m_strData[i].trimmed().toInt()); - query.bindValue(QString(":" + m_strColumn[i]), m_strData[i].trimmed().toUtf8()); + else + query.bindValue(QString(":" + m_strColumn[i]), m_strData[i].trimmed().toUtf8()); } + if (query.exec()==false) { cout << "error : " << query.lastError().text().toStdString(); + cout << endl << query.lastQuery().toStdString() << endl ; return false; } return true; diff --git a/CrawlerProcess/scrawlerdata.h b/CrawlerProcess/scrawlerdata.h index 09962e8..e338123 100644 --- a/CrawlerProcess/scrawlerdata.h +++ b/CrawlerProcess/scrawlerdata.h @@ -28,7 +28,8 @@ public: PLATFORM_ID, KEYWORD_ID, REPLY_URL, - TOTAL_COUNT + //ETC, + TOTAL_COUNT, }; private: @@ -37,7 +38,6 @@ private: QString m_strTable; private: - QString GetSafeUtf(QString _strData); QString getTable(); public: @@ -45,6 +45,8 @@ public: ~SCrawlerData(); QStringList GetNumber(QString _str); QString getData(int _num); + QString GetSafeUtf(QString _strData); + void setData(QString _str, int _num); void clear(); void clear(int _num);