From f1629c85f3245b5eb4c8507b0d4fbbd2b542d652 Mon Sep 17 00:00:00 2001 From: admin Date: Thu, 18 Jun 2015 08:43:34 +0000 Subject: [PATCH] =?UTF-8?q?=EB=89=B4=EC=8A=A4=20=EC=BD=94=EB=A5=BC?= =?UTF-8?q?=EB=A7=81=20=EA=B8=B0=EB=8A=A5=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://192.168.0.12/source@148 8346c931-da38-4b9b-9d4c-e48b93cbd075 --- CrawlerProcess/main.cpp | 2 -- CrawlerProcess/scrawler.cpp | 54 ++++++++++++++++++++++++++++++--- CrawlerProcess/scrawler.h | 4 ++- CrawlerProcess/scrawlerdata.cpp | 12 +++++++- CrawlerProcess/scrawlerdata.h | 5 ++- 5 files changed, 65 insertions(+), 12 deletions(-) diff --git a/CrawlerProcess/main.cpp b/CrawlerProcess/main.cpp index 3d3e29a..440ccd2 100644 --- a/CrawlerProcess/main.cpp +++ b/CrawlerProcess/main.cpp @@ -15,8 +15,6 @@ int main(int argc, char *argv[]) a.setApplicationName(QString("Chrome")); a.setApplicationVersion(QString("39.0.2171.95")); - - QSqlDatabase db = QSqlDatabase::addDatabase("QMYSQL"); db.setHostName("bigbird.iptime.org"); db.setUserName("admin"); diff --git a/CrawlerProcess/scrawler.cpp b/CrawlerProcess/scrawler.cpp index f104fb9..9cc637a 100644 --- a/CrawlerProcess/scrawler.cpp +++ b/CrawlerProcess/scrawler.cpp @@ -16,7 +16,7 @@ struct SProxyList SCrawler::SCrawler():QObject() { m_page = new QWebPage; - connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool))); + connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool))); } SCrawler::~SCrawler() @@ -29,6 +29,13 @@ void SCrawler::load(QStringList _strlistArgv) if (_strlistArgv[0] == "naver") { + if (_strlistArgv[1] == "news_list") + { + m_strUrl = _strlistArgv[2]; + m_nSelect = E_NAVER_NEWS_LIST; + setProxy(); + } + if (_strlistArgv[1] == "cafe_list") { m_strUrl = _strlistArgv[2]; @@ -66,7 +73,9 @@ void SCrawler::load(QStringList _strlistArgv) m_strUrl = _strlistArgv[2]; m_nSelect = E_NAVER_BLOG_REPLY; } - m_strTable = "data_" + _strlistArgv[3]; + + if (_strlistArgv.size() > 3) + m_strTable = "data_" + _strlistArgv[3]; } @@ -162,6 +171,7 @@ void SCrawler::saveResult(bool ok) switch(m_nSelect) { + case E_NAVER_NEWS_LIST:saveFrameNewsList(m_page->mainFrame());break; case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break; case E_NAVER_CAFE_DATA: { @@ -200,6 +210,7 @@ void SCrawler::saveResult(bool ok) case E_NAVER_BLOG_LIST: case E_DAUM_CAFE_LIST: case E_DAUM_BLOG_LIST: + case E_NAVER_NEWS_LIST: if (m_bError) { cout << "block";// block @@ -294,7 +305,6 @@ void SCrawler::saveFrameList(QWebFrame *frame) } QWebElement eleMain = Find(frame->documentElement(),"div","class","blog section _blogBase"); - QSqlQuery sql; QStringList urlList; for (int i = 0; i < 10 ; i++) @@ -1133,12 +1143,11 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame) { if (m_bUse == true) return; - static int cz = 0; + ///static int cz = 0; // Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml()); //int nLast = 0; QStringList urlList; QWebElement eleMain = Find(frame->documentElement(),"div","class","type_fulltext wid_f"); - foreach(QWebElement eleSub,eleMain.findAll("div")) { if (eleSub.attribute("class") == "wrap_cont") @@ -1564,6 +1573,41 @@ void SCrawler::saveFrameDaumBlogUrl(QWebFrame *frame){} void SCrawler::saveFrameDaumBlogComment(QWebFrame *frame){} void SCrawler::saveFrameDaumBlogList(QWebFrame *frame){} +void SCrawler::saveFrameNewsList(QWebFrame *frame) +{ + if (m_bUse == true) return; + QWebElement eleMain = Find(frame->documentElement(),"div","class","srch_result_area headline"); + foreach(QWebElement eleSub,eleMain.findAll("div")) + { + if (eleSub.attribute("class") == QString("info")) + { + QString str = Find(eleSub,"a","class","go_naver").attribute("href"); + if (str.trimmed().isEmpty()) continue; + if (str.contains("http://sports")) continue; + cout << "o " << str.toStdString() << endl; + } + } + QStringList strTotal = bodydata.GetNumber(Find(frame->documentElement(),"span","class","result_num").toPlainText().trimmed()); + QVector vecTotal; + foreach(QString str,strTotal) + { + if (str.trimmed().isEmpty() == false) + { + vecTotal.push_back(str.toInt()); + } + } + if (vecTotal.size() == 3) + { + if (vecTotal[0] >= vecTotal[1]) + m_bLast = true; + if (vecTotal[1] == vecTotal[2]) + m_bLast = true; + } + else + m_bError = true; + m_bUse = true; +} + QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind) { QWebElementCollection elements = _FindElement.findAll(_strElement); diff --git a/CrawlerProcess/scrawler.h b/CrawlerProcess/scrawler.h index 6e560ef..c56f1a1 100644 --- a/CrawlerProcess/scrawler.h +++ b/CrawlerProcess/scrawler.h @@ -18,7 +18,8 @@ public: E_DAUM_CAFE_DATA, E_DAUM_BLOG_LIST, E_DAUM_BLOG_BODY, - E_DAUM_BLOG_REPLY + E_DAUM_BLOG_REPLY, + E_NAVER_NEWS_LIST }; public: SCrawler(); @@ -61,6 +62,7 @@ private: void saveFrameDaumBlogUrl(QWebFrame *frame); void saveFrameDaumBlogComment(QWebFrame *frame); void saveFrameDaumCafeUrl(QWebFrame *frame); + void saveFrameNewsList(QWebFrame *frame); int GetNumber(QString _str); bool getProxyList(QString &_str); void setProxy(); diff --git a/CrawlerProcess/scrawlerdata.cpp b/CrawlerProcess/scrawlerdata.cpp index 793c755..764d96a 100644 --- a/CrawlerProcess/scrawlerdata.cpp +++ b/CrawlerProcess/scrawlerdata.cpp @@ -102,7 +102,6 @@ bool SCrawlerData::sendDB() cout << "error : " << query.lastError().text().toStdString(); return false; } - return true; } @@ -123,4 +122,15 @@ QString SCrawlerData::GetSafeUtf(QString _strData) return str; } +QStringList SCrawlerData::GetNumber(QString _str) +{ + QString str; + QChar *pch = _str.data(); + for (int i = 0; i < _str.length(); i++) + { + if (pch[i].isNumber() || pch[i].isSpace()) + str += pch[i]; + } + return str.trimmed().split(" "); +} diff --git a/CrawlerProcess/scrawlerdata.h b/CrawlerProcess/scrawlerdata.h index a7bd365..09962e8 100644 --- a/CrawlerProcess/scrawlerdata.h +++ b/CrawlerProcess/scrawlerdata.h @@ -38,13 +38,12 @@ private: private: QString GetSafeUtf(QString _strData); - QString getTable(); - - + QString getTable(); public: SCrawlerData(); ~SCrawlerData(); + QStringList GetNumber(QString _str); QString getData(int _num); void setData(QString _str, int _num); void clear();