From b297d693d7bb1dcad7f13f48a93b7cc2c4c0fc94 Mon Sep 17 00:00:00 2001 From: admin Date: Wed, 28 Oct 2015 08:15:34 +0000 Subject: [PATCH] =?UTF-8?q?=EB=84=A4=EC=9D=B4=EB=B2=84=20=EB=B8=94?= =?UTF-8?q?=EB=A1=9C=EA=B7=B8=20=ED=81=AC=EB=A1=A4=EB=A7=81=20=EC=98=A4?= =?UTF-8?q?=EB=A5=98=EC=8B=9C=20timer=EB=A5=BC=20=ED=86=B5=ED=95=B4=20webp?= =?UTF-8?q?age=20=EC=9D=BD=EC=96=B4=EC=98=A4=EB=8F=84=EB=A1=9D=20=EC=88=98?= =?UTF-8?q?=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://192.168.0.12/source@217 8346c931-da38-4b9b-9d4c-e48b93cbd075 --- CrawlerProcess/scrawler.cpp | 54 +++++++++++++++++++++++++++++++++---- CrawlerProcess/scrawler.h | 9 ++++--- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/CrawlerProcess/scrawler.cpp b/CrawlerProcess/scrawler.cpp index df27a8f..f8efc2d 100644 --- a/CrawlerProcess/scrawler.cpp +++ b/CrawlerProcess/scrawler.cpp @@ -4,7 +4,10 @@ #include #include #include +#include +#define BLOG_RETRY_MAX 4 +#define BLOG_RETRY_INTERVAL 3000 using namespace std; struct SProxyList @@ -16,6 +19,7 @@ struct SProxyList SCrawler::SCrawler():QObject() { m_page = new QWebPage; + m_nBlogRetryCount = 0; connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool))); } @@ -182,7 +186,8 @@ void SCrawler::saveResult(bool ok) case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break; case E_NAVER_BLOG_BODY: { - saveFrameUrl(m_page->mainFrame()); + if(!saveFrameUrl(m_page->mainFrame())) + return; bodydata.sendDB(); break; } @@ -477,12 +482,12 @@ enum E_DATA E_DATA_MAX, }; -void SCrawler::saveFrameUrl(QWebFrame *frame) +bool SCrawler::saveFrameUrl(QWebFrame *frame) { - static int cz = 0; + //static int cz = 0; // Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml()); - QSqlQuery sql; + //QSqlQuery sql; if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0) { QWebElement profile = 
Find(frame->documentElement(),"div","class","profile_name"); @@ -493,6 +498,7 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) if (frame->frameName().compare(QString("mainFrame")) == 0) { QString str[E_DATA_MAX]; + QString sympathy; QString strProfile; QWebElement proTitle = Find(frame->documentElement(),"meta","property","og:article:author"); str[E_DATA_PLATFORM_TITLE] = proTitle.attribute("content").split("|").at(1).trimmed(); @@ -585,9 +591,37 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) str[E_DATA_DATA] = GetSafeUtf(str[E_DATA_DATA]); } } + { + + QWebElement WEsympathy = Find(frame->documentElement(),"div","class","btn_like pcol2"); + + if(WEsympathy.isNull()) + { + sympathy = "0"; + } + else + { + sympathy = WEsympathy.toPlainText().trimmed(); + } + //qDebug() << "Sympathy: " << sympathy; + //qDebug() << strProfile; + + } + { + + + //retry while profile or sympathy is still empty, but at most BLOG_RETRY_MAX times + if((strProfile.isEmpty() || sympathy.isEmpty()) && (m_nBlogRetryCount < BLOG_RETRY_MAX)) + { + m_nBlogRetryCount++; + QTimer::singleShot(BLOG_RETRY_INTERVAL, this, SLOT(crawlBlog())); + return false; + } + } } + //QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"}; bodydata.setData(str[0].trimmed(), bodydata.ARTICLE_NICKNAME); bodydata.setData(str[1].trimmed(), bodydata.ARTICLE_ID); @@ -608,6 +642,8 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) bodydata.setData(m_strUrl, bodydata.ARTICLE_URL); bodydata.setTable(m_strTable); + bodydata.setData(sympathy, bodydata.ARTICLE_HIT); + bodydata.setData("naver", bodydata.PLATFORM_NAME); bodydata.setData("blog", bodydata.PLATFORM_FORM); bodydata.setData("body", bodydata.ARTICLE_FORM); @@ -616,8 +652,16 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) } + bool b_ok = true; foreach(QWebFrame *childFrame, frame->childFrames()) - saveFrameUrl(childFrame); + b_ok = (b_ok && saveFrameUrl(childFrame)); + + return b_ok; +} + +void SCrawler::crawlBlog() +{ + 
saveResult(true); } void SCrawler::saveFrameComment(QWebFrame *frame) diff --git a/CrawlerProcess/scrawler.h b/CrawlerProcess/scrawler.h index ccbe680..1f3c071 100644 --- a/CrawlerProcess/scrawler.h +++ b/CrawlerProcess/scrawler.h @@ -31,7 +31,8 @@ signals: void finished(); private slots: void saveResult(bool ok); -private: + void crawlBlog(); +private: int m_nSelect; QString m_strReper; QString m_strKeywordID; @@ -47,15 +48,14 @@ private: bool m_bLast; bool m_bError; bool m_bNothing; - QString m_strProxyIP; int m_nProxyPort; - + int m_nBlogRetryCount; QString SqlString(QString _str); QString GetSafeUtf(QString _strData); void saveFrameList(QWebFrame *frame); void saveFrameCafeList(QWebFrame *frame); - void saveFrameUrl(QWebFrame *frame); + bool saveFrameUrl(QWebFrame *frame); void saveFrameComment(QWebFrame *frame); void saveFrameCafeUrl(QWebFrame *frame); void saveFrameDaumBlogList(QWebFrame *frame); @@ -64,6 +64,7 @@ private: void saveFrameDaumBlogComment(QWebFrame *frame); void saveFrameDaumCafeUrl(QWebFrame *frame); void saveFrameNewsList(QWebFrame *frame); + int GetNumber(QString _str); bool getProxyList(QString &_str); void setProxy();