네이버 블로그 크롤링 오류 시 timer를 통해 webpage를 읽어오도록 수정

git-svn-id: svn://192.168.0.12/source@217 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2015-10-28 08:15:34 +00:00
parent ec55f45ce4
commit b297d693d7
2 changed files with 54 additions and 9 deletions

View File

@@ -4,7 +4,10 @@
#include <QSqlError> #include <QSqlError>
#include <QByteArray> #include <QByteArray>
#include <QDebug> #include <QDebug>
#include <QTimer>
// Upper bound that m_nBlogRetryCount is compared against in saveFrameUrl()
// before another timer-driven retry is scheduled.
#define BLOG_RETRY_MAX 4
// Delay handed to QTimer::singleShot() before crawlBlog() fires (milliseconds).
#define BLOG_RETRY_INTERVAL 3000
using namespace std; using namespace std;
struct SProxyList struct SProxyList
@@ -16,6 +19,7 @@ struct SProxyList
SCrawler::SCrawler():QObject() SCrawler::SCrawler():QObject()
{ {
m_page = new QWebPage; m_page = new QWebPage;
m_nBlogRetryCount = 0;
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool))); connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
} }
@@ -182,7 +186,8 @@ void SCrawler::saveResult(bool ok)
case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break; case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break;
case E_NAVER_BLOG_BODY: case E_NAVER_BLOG_BODY:
{ {
saveFrameUrl(m_page->mainFrame()); if(!saveFrameUrl(m_page->mainFrame()))
return;
bodydata.sendDB(); bodydata.sendDB();
break; break;
} }
@@ -477,12 +482,12 @@ enum E_DATA
E_DATA_MAX, E_DATA_MAX,
}; };
void SCrawler::saveFrameUrl(QWebFrame *frame) bool SCrawler::saveFrameUrl(QWebFrame *frame)
{ {
static int cz = 0; //static int cz = 0;
// Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml()); // Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml());
QSqlQuery sql; //QSqlQuery sql;
if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0) if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0)
{ {
QWebElement profile = Find(frame->documentElement(),"div","class","profile_name"); QWebElement profile = Find(frame->documentElement(),"div","class","profile_name");
@@ -493,6 +498,7 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
if (frame->frameName().compare(QString("mainFrame")) == 0) if (frame->frameName().compare(QString("mainFrame")) == 0)
{ {
QString str[E_DATA_MAX]; QString str[E_DATA_MAX];
QString sympathy;
QString strProfile; QString strProfile;
QWebElement proTitle = Find(frame->documentElement(),"meta","property","og:article:author"); QWebElement proTitle = Find(frame->documentElement(),"meta","property","og:article:author");
str[E_DATA_PLATFORM_TITLE] = proTitle.attribute("content").split("|").at(1).trimmed(); str[E_DATA_PLATFORM_TITLE] = proTitle.attribute("content").split("|").at(1).trimmed();
@@ -585,9 +591,37 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
str[E_DATA_DATA] = GetSafeUtf(str[E_DATA_DATA]); str[E_DATA_DATA] = GetSafeUtf(str[E_DATA_DATA]);
} }
} }
{
QWebElement WEsympathy = Find(frame->documentElement(),"div","class","btn_like pcol2");
if(WEsympathy.isNull())
{
sympathy = "0";
}
else
{
sympathy = WEsympathy.toPlainText().trimmed();
}
//qDebug() << "Sympathy: " << sympathy;
//qDebug() << strProfile;
}
{
    // Retry while the profile text or the sympathy (like) text is still empty
    // -- presumably the page has not finished rendering yet (TODO confirm).
    //
    // The two emptiness tests must be grouped before applying the retry
    // limit: && binds tighter than ||, so the ungrouped form
    //     a || b && c
    // means a || (b && c), and an empty profile would reschedule crawlBlog()
    // forever, ignoring BLOG_RETRY_MAX.
    const bool bIncomplete = strProfile.isEmpty() || sympathy.isEmpty();
    if(bIncomplete && (m_nBlogRetryCount < BLOG_RETRY_MAX))
    {
        m_nBlogRetryCount++;
        QTimer::singleShot(BLOG_RETRY_INTERVAL, this, SLOT(crawlBlog()));
        return false;
    }

    // Either the data is complete or the retries are exhausted: give the
    // next article a fresh retry budget instead of carrying the count over.
    m_nBlogRetryCount = 0;
}
} }
//QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"}; //QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"};
bodydata.setData(str[0].trimmed(), bodydata.ARTICLE_NICKNAME); bodydata.setData(str[0].trimmed(), bodydata.ARTICLE_NICKNAME);
bodydata.setData(str[1].trimmed(), bodydata.ARTICLE_ID); bodydata.setData(str[1].trimmed(), bodydata.ARTICLE_ID);
@@ -608,6 +642,8 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL); bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
bodydata.setTable(m_strTable); bodydata.setTable(m_strTable);
bodydata.setData(sympathy, bodydata.ARTICLE_HIT);
bodydata.setData("naver", bodydata.PLATFORM_NAME); bodydata.setData("naver", bodydata.PLATFORM_NAME);
bodydata.setData("blog", bodydata.PLATFORM_FORM); bodydata.setData("blog", bodydata.PLATFORM_FORM);
bodydata.setData("body", bodydata.ARTICLE_FORM); bodydata.setData("body", bodydata.ARTICLE_FORM);
@@ -616,8 +652,16 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
} }
bool b_ok = true;
foreach(QWebFrame *childFrame, frame->childFrames()) foreach(QWebFrame *childFrame, frame->childFrames())
saveFrameUrl(childFrame); b_ok = (b_ok && saveFrameUrl(childFrame));
return b_ok;
}
// Slot invoked by the QTimer::singleShot() retry scheduled in saveFrameUrl().
// It re-enters saveResult() with ok == true so the result handling runs again.
void SCrawler::crawlBlog()
{
saveResult(true);
} }
void SCrawler::saveFrameComment(QWebFrame *frame) void SCrawler::saveFrameComment(QWebFrame *frame)

View File

@@ -31,7 +31,8 @@ signals:
void finished(); void finished();
private slots: private slots:
void saveResult(bool ok); void saveResult(bool ok);
private: void crawlBlog();
private:
int m_nSelect; int m_nSelect;
QString m_strReper; QString m_strReper;
QString m_strKeywordID; QString m_strKeywordID;
@@ -47,15 +48,14 @@ private:
bool m_bLast; bool m_bLast;
bool m_bError; bool m_bError;
bool m_bNothing; bool m_bNothing;
QString m_strProxyIP; QString m_strProxyIP;
int m_nProxyPort; int m_nProxyPort;
int m_nBlogRetryCount;
QString SqlString(QString _str); QString SqlString(QString _str);
QString GetSafeUtf(QString _strData); QString GetSafeUtf(QString _strData);
void saveFrameList(QWebFrame *frame); void saveFrameList(QWebFrame *frame);
void saveFrameCafeList(QWebFrame *frame); void saveFrameCafeList(QWebFrame *frame);
void saveFrameUrl(QWebFrame *frame); bool saveFrameUrl(QWebFrame *frame);
void saveFrameComment(QWebFrame *frame); void saveFrameComment(QWebFrame *frame);
void saveFrameCafeUrl(QWebFrame *frame); void saveFrameCafeUrl(QWebFrame *frame);
void saveFrameDaumBlogList(QWebFrame *frame); void saveFrameDaumBlogList(QWebFrame *frame);
@@ -64,6 +64,7 @@ private:
void saveFrameDaumBlogComment(QWebFrame *frame); void saveFrameDaumBlogComment(QWebFrame *frame);
void saveFrameDaumCafeUrl(QWebFrame *frame); void saveFrameDaumCafeUrl(QWebFrame *frame);
void saveFrameNewsList(QWebFrame *frame); void saveFrameNewsList(QWebFrame *frame);
int GetNumber(QString _str); int GetNumber(QString _str);
bool getProxyList(QString &_str); bool getProxyList(QString &_str);
void setProxy(); void setProxy();