네이버 블로그 크롤링 오류 시 timer를 통해 webpage를 읽어오도록 수정
git-svn-id: svn://192.168.0.12/source@217 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -4,7 +4,10 @@
|
|||||||
#include <QSqlError>
|
#include <QSqlError>
|
||||||
#include <QByteArray>
|
#include <QByteArray>
|
||||||
#include <QDebug>
|
#include <QDebug>
|
||||||
|
#include <QTimer>
|
||||||
|
|
||||||
|
#define BLOG_RETRY_MAX 4
|
||||||
|
#define BLOG_RETRY_INTERVAL 3000
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
struct SProxyList
|
struct SProxyList
|
||||||
@@ -16,6 +19,7 @@ struct SProxyList
|
|||||||
// Constructor (shown here as old | new diff rows).
// Creates the QWebPage stored in m_page and connects its loadFinished(bool)
// signal to the saveResult(bool) slot. The new side additionally zeroes
// m_nBlogRetryCount, the counter compared against BLOG_RETRY_MAX in
// saveFrameUrl().
SCrawler::SCrawler():QObject()
|
SCrawler::SCrawler():QObject()
|
||||||
{
|
{
|
||||||
m_page = new QWebPage;
|
m_page = new QWebPage;
|
||||||
|
// Added by this commit: start with no blog retries consumed.
m_nBlogRetryCount = 0;
|
||||||
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
|
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -182,7 +186,8 @@ void SCrawler::saveResult(bool ok)
|
|||||||
case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break;
|
case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break;
|
||||||
case E_NAVER_BLOG_BODY:
|
case E_NAVER_BLOG_BODY:
|
||||||
{
|
{
|
||||||
saveFrameUrl(m_page->mainFrame());
|
if(!saveFrameUrl(m_page->mainFrame()))
|
||||||
|
return;
|
||||||
bodydata.sendDB();
|
bodydata.sendDB();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -477,12 +482,12 @@ enum E_DATA
|
|||||||
E_DATA_MAX,
|
E_DATA_MAX,
|
||||||
};
|
};
|
||||||
|
|
||||||
void SCrawler::saveFrameUrl(QWebFrame *frame)
|
bool SCrawler::saveFrameUrl(QWebFrame *frame)
|
||||||
{
|
{
|
||||||
static int cz = 0;
|
//static int cz = 0;
|
||||||
// Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml());
|
// Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml());
|
||||||
|
|
||||||
QSqlQuery sql;
|
//QSqlQuery sql;
|
||||||
if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0)
|
if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0)
|
||||||
{
|
{
|
||||||
QWebElement profile = Find(frame->documentElement(),"div","class","profile_name");
|
QWebElement profile = Find(frame->documentElement(),"div","class","profile_name");
|
||||||
@@ -493,6 +498,7 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
|||||||
if (frame->frameName().compare(QString("mainFrame")) == 0)
|
if (frame->frameName().compare(QString("mainFrame")) == 0)
|
||||||
{
|
{
|
||||||
QString str[E_DATA_MAX];
|
QString str[E_DATA_MAX];
|
||||||
|
QString sympathy;
|
||||||
QString strProfile;
|
QString strProfile;
|
||||||
QWebElement proTitle = Find(frame->documentElement(),"meta","property","og:article:author");
|
QWebElement proTitle = Find(frame->documentElement(),"meta","property","og:article:author");
|
||||||
str[E_DATA_PLATFORM_TITLE] = proTitle.attribute("content").split("|").at(1).trimmed();
|
str[E_DATA_PLATFORM_TITLE] = proTitle.attribute("content").split("|").at(1).trimmed();
|
||||||
@@ -585,9 +591,37 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
|||||||
str[E_DATA_DATA] = GetSafeUtf(str[E_DATA_DATA]);
|
str[E_DATA_DATA] = GetSafeUtf(str[E_DATA_DATA]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
{
    // Read the "sympathy" (like) text from the like button of this frame.
    // A missing button is recorded as "0"; a present button contributes its
    // trimmed plain text, which may still be empty.
    const QWebElement likeButton = Find(frame->documentElement(),"div","class","btn_like pcol2");
    sympathy = likeButton.isNull() ? QString("0")
                                   : likeButton.toPlainText().trimmed();
}
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
//retry if profile is empty and sympathy is empty
|
||||||
|
if(strProfile.isEmpty() || sympathy.isEmpty() && (m_nBlogRetryCount < BLOG_RETRY_MAX))
|
||||||
|
{
|
||||||
|
m_nBlogRetryCount++;
|
||||||
|
QTimer::singleShot(BLOG_RETRY_INTERVAL, this, SLOT(crawlBlog()));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"};
|
//QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"};
|
||||||
bodydata.setData(str[0].trimmed(), bodydata.ARTICLE_NICKNAME);
|
bodydata.setData(str[0].trimmed(), bodydata.ARTICLE_NICKNAME);
|
||||||
bodydata.setData(str[1].trimmed(), bodydata.ARTICLE_ID);
|
bodydata.setData(str[1].trimmed(), bodydata.ARTICLE_ID);
|
||||||
@@ -608,6 +642,8 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
|||||||
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
|
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
|
||||||
bodydata.setTable(m_strTable);
|
bodydata.setTable(m_strTable);
|
||||||
|
|
||||||
|
bodydata.setData(sympathy, bodydata.ARTICLE_HIT);
|
||||||
|
|
||||||
bodydata.setData("naver", bodydata.PLATFORM_NAME);
|
bodydata.setData("naver", bodydata.PLATFORM_NAME);
|
||||||
bodydata.setData("blog", bodydata.PLATFORM_FORM);
|
bodydata.setData("blog", bodydata.PLATFORM_FORM);
|
||||||
bodydata.setData("body", bodydata.ARTICLE_FORM);
|
bodydata.setData("body", bodydata.ARTICLE_FORM);
|
||||||
@@ -616,8 +652,16 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool b_ok = true;
|
||||||
foreach(QWebFrame *childFrame, frame->childFrames())
|
foreach(QWebFrame *childFrame, frame->childFrames())
|
||||||
saveFrameUrl(childFrame);
|
b_ok = (b_ok && saveFrameUrl(childFrame));
|
||||||
|
|
||||||
|
return b_ok;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Slot targeted by the QTimer::singleShot() retry armed in saveFrameUrl().
// It calls saveResult(true), the same slot that is connected to the page's
// loadFinished(bool) signal; this function itself issues no new load request.
void SCrawler::crawlBlog()
|
||||||
|
{
|
||||||
|
saveResult(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SCrawler::saveFrameComment(QWebFrame *frame)
|
void SCrawler::saveFrameComment(QWebFrame *frame)
|
||||||
|
|||||||
@@ -31,7 +31,8 @@ signals:
|
|||||||
void finished();
|
void finished();
|
||||||
private slots:
|
private slots:
|
||||||
void saveResult(bool ok);
|
void saveResult(bool ok);
|
||||||
private:
|
void crawlBlog();
|
||||||
|
private:
|
||||||
int m_nSelect;
|
int m_nSelect;
|
||||||
QString m_strReper;
|
QString m_strReper;
|
||||||
QString m_strKeywordID;
|
QString m_strKeywordID;
|
||||||
@@ -47,15 +48,14 @@ private:
|
|||||||
bool m_bLast;
|
bool m_bLast;
|
||||||
bool m_bError;
|
bool m_bError;
|
||||||
bool m_bNothing;
|
bool m_bNothing;
|
||||||
|
|
||||||
QString m_strProxyIP;
|
QString m_strProxyIP;
|
||||||
int m_nProxyPort;
|
int m_nProxyPort;
|
||||||
|
int m_nBlogRetryCount;
|
||||||
QString SqlString(QString _str);
|
QString SqlString(QString _str);
|
||||||
QString GetSafeUtf(QString _strData);
|
QString GetSafeUtf(QString _strData);
|
||||||
void saveFrameList(QWebFrame *frame);
|
void saveFrameList(QWebFrame *frame);
|
||||||
void saveFrameCafeList(QWebFrame *frame);
|
void saveFrameCafeList(QWebFrame *frame);
|
||||||
void saveFrameUrl(QWebFrame *frame);
|
bool saveFrameUrl(QWebFrame *frame);
|
||||||
void saveFrameComment(QWebFrame *frame);
|
void saveFrameComment(QWebFrame *frame);
|
||||||
void saveFrameCafeUrl(QWebFrame *frame);
|
void saveFrameCafeUrl(QWebFrame *frame);
|
||||||
void saveFrameDaumBlogList(QWebFrame *frame);
|
void saveFrameDaumBlogList(QWebFrame *frame);
|
||||||
@@ -64,6 +64,7 @@ private:
|
|||||||
void saveFrameDaumBlogComment(QWebFrame *frame);
|
void saveFrameDaumBlogComment(QWebFrame *frame);
|
||||||
void saveFrameDaumCafeUrl(QWebFrame *frame);
|
void saveFrameDaumCafeUrl(QWebFrame *frame);
|
||||||
void saveFrameNewsList(QWebFrame *frame);
|
void saveFrameNewsList(QWebFrame *frame);
|
||||||
|
|
||||||
int GetNumber(QString _str);
|
int GetNumber(QString _str);
|
||||||
bool getProxyList(QString &_str);
|
bool getProxyList(QString &_str);
|
||||||
void setProxy();
|
void setProxy();
|
||||||
|
|||||||
Reference in New Issue
Block a user