Files
clients/EffectProcess/snaverblog.cpp
admin 9b479f3e9a effectprocess 추가
git-svn-id: svn://192.168.0.12/source@306 8346c931-da38-4b9b-9d4c-e48b93cbd075
2016-10-28 10:10:05 +00:00

632 lines
22 KiB
C++
Raw Blame History

#include "snaverblog.h"
#include <QUrl>
#include <QWebFrame>
#include <QTimer>
#include <QWebElement>
#include <QWebElementCollection>
#include <QChar>
namespace
{
// Delay handed to QTimer::singleShot before slotLoadFinished re-parses a
// body page whose data failed check(). Value is in milliseconds (3 seconds).
const int TIME = 3000;//3 seconds
}
// Looks through the elements returned by _FindElement.findAll(_strElement)
// and returns the first one whose attribute _strAttrib compares equal to
// _strFind. When none qualifies a default-constructed QWebElement is returned.
QWebElement Find(const QWebElement& _FindElement, const QString& _strElement = "",
                 const QString& _strAttrib = "" , const QString& _strFind = "")
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();
    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        if (candidate.attribute(_strAttrib) == _strFind)
            return candidate;
    }
    return QWebElement();
}
// Returns a copy of _strData reduced to the characters the crawler keeps:
//  - code points 12593..12622 (U+3131..U+314E),
//  - code points 44032..55203 (U+AC00..U+D7A3),
//  - anything QChar classifies as digit, number, space, lower-case,
//    upper-case or symbol.
// Every other character is dropped. The three filters are evaluated
// independently for each character, in this order.
QString GetSafeUtf(const QString& _strData)
{
    QString filtered;
    const int total = _strData.length();
    for (int pos = 0; pos < total; ++pos)
    {
        const QChar ch = _strData.at(pos);
        const int code = ch.unicode();

        if (code >= 12593 && code <= 12622)
            filtered.append(ch);

        if (code >= 44032 && code <= 55203)
            filtered.append(ch);

        const bool keepByCategory = ch.isDigit() || ch.isNumber() || ch.isSpace()
                                 || ch.isLower() || ch.isUpper() || ch.isSymbol();
        if (keepByCategory)
            filtered.append(ch);
    }
    return filtered;
}
// Scope guard for a bool flag: sets the referenced flag to true on
// construction and back to false on destruction.
//
// Copying is disabled. A copy would refer to the same flag and clear it when
// the first of the two guards goes out of scope, while the other guard is
// still alive.
class BoolController
{
public:
    // _b must outlive the guard; only a reference to it is stored.
    explicit BoolController(bool& _b):m_b(_b)
    {
        m_b = true;
    }
    ~BoolController()
    {
        m_b = false;
    }
    BoolController(const BoolController&) = delete;
    BoolController& operator=(const BoolController&) = delete;
private:
    bool &m_b;
};
// Calls unlock() on the given mutex. Lock passes this function to its
// std::shared_ptr<QMutex> as the custom deleter, so releasing the last
// reference unlocks the mutex instead of deleting it.
void unlock(QMutex* _pm)
{
_pm->unlock();
}
class Lock
{
public:
explicit Lock(QMutex* _pm):m_pMutex(_pm, unlock)
{
_pm->lock();
}
private:
std::shared_ptr<QMutex> m_pMutex;
};
// Builds a crawler around the given web page. When _page is empty a fresh
// SWebPage is created and owned by this object. The crawler starts in BODY
// mode with three body-parse retries available, then wires up the page
// signals via initConnect().
SNaverBlog::SNaverBlog(std::shared_ptr<SWebPage> _page):m_pWebPage(_page),
    m_eMode(E_CRAWL_MODE::BODY), m_bReplyProcessed(false), m_bBodyProcessed(false),
    m_pNetworkRequest(new QNetworkRequest), m_nBodyRetry(3)
{
    // The previous form, make_shared<SWebPage>(new SWebPage), heap-allocated a
    // second page, passed it as a constructor argument to the shared one and
    // never released it. A single default-constructed page is all that is needed.
    if (!_page)
        m_pWebPage = std::make_shared<SWebPage>();
    initConnect();
}
void SNaverBlog::initConnect()
{
QObject::connect(&(*m_pWebPage), &SWebPage::loadFinished,
this, &SNaverBlog::slotLoadFinished);
QObject::connect(&(*m_pWebPage), &SWebPage::signalAlert,
this, &SNaverBlog::slotAlert);
QObject::connect(&(*m_pWebPage), &SWebPage::loadProgress,
[](int n){ qDebug() << n; });
QObject::connect(&(*m_pWebPage), &SWebPage::loadStarted,
[](){ qDebug() << "loadstart"; });
m_pWebPage->settings()->setAttribute(QWebSettings::AutoLoadImages, false);
}
// Slot connected to SWebPage::signalAlert (see initConnect). Re-emits the
// alert text as signalError with E_ERROR_CODE::DELETED_URL_ERROR.
void SNaverBlog::slotAlert(const QString& msg)
{
emit signalError(E_ERROR_CODE::DELETED_URL_ERROR, msg);
}
void SNaverBlog::go(const QString &_url, E_CRAWL_MODE _mode)
{
qDebug() << _url;
m_eMode = _mode;
QUrl url(_url.trimmed());
if (_mode == E_CRAWL_MODE::BODY)
m_strUrl = _url.trimmed();
if (url.scheme().isEmpty())
url.setScheme("http");
QNetworkRequest requests;
requests.setUrl(url);
requests.setRawHeader("Accept-Language",
"ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
m_pWebPage->mainFrame()->load(requests);
/*
m_pNetworkRequest->setUrl(url);
m_pNetworkRequest->setRawHeader(
"Accept-Language",
"ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2"
);
m_pWebPage->mainFrame()->load(*m_pNetworkRequest);
*/
}
void SNaverBlog::slotLoadFinished(bool ok)
{
qDebug() << "slotLoadFinished";
if (!ok)
{
signalError(E_ERROR_CODE::UNKNOWN_ERROR, "UnkownError/BlockError/ProxyError");
return;
}
qDebug() << "slotLoadFinished";
switch(m_eMode)
{
case E_CRAWL_MODE::BODY:
{
Lock lock(&m_mutexBody);
if (m_bBodyProcessed)
return;
qDebug() << "body loadfinished";
m_efData.body = getBody();
if (!check(m_efData.body) && m_nBodyRetry-- > 0)
{
//QTimer::singleShot(TIME, this, SLOT(slotLoadFinished(bool)));
QTimer::singleShot(TIME, [&](){ slotLoadFinished(true);});
return;
}
qDebug() << m_efData.body.data[ARTICLE_ID];
qDebug() << m_efData.body.data[ARTICLE_URL];
qDebug() << m_efData.body.data[ARTICLE_PROFILE];
qDebug() << m_efData.body.data[ARTICLE_NICKNAME];
qDebug() << m_efData.body.data[ARTICLE_HIT];
qDebug() << m_efData.body.data[REPLY_URL];
qDebug() << m_efData.body.data[ARTICLE_TITLE];
m_bBodyProcessed = true;
qDebug() << "body loadfinished";
qDebug() << makeReplyUrl(m_strUrl);
go(makeReplyUrl(m_strUrl), E_CRAWL_MODE::REPLY);
return;
}
case E_CRAWL_MODE::REPLY:
{
Lock lock(&m_mutexReply);
if (m_bReplyProcessed)
return;
qDebug() << "reply loadfinished";
m_efData.reply = getReply();
m_bReplyProcessed = true;
break;
}
}
emit signalDataOk(m_efData);
}
/*
void SNaverBlog::slotLoadFinished(bool ok)
{
if (!ok)
{
signalError(E_ERROR_CODE::UNKNOWN_ERROR, "UnkownError/BlockError/ProxyError");
return;
}
switch(m_eMode)
{
case E_CRAWL_MODE::BODY:
{
if (m_bBodyProcessing || m_bBodyProcessed)
return;
BoolController(m_bBodyProcessing);
m_efData.body = getBody();
if (!check(m_efData.body) && m_nBodyRetry-- > 0)
{
QTimer::singleShot(TIME, this, &SNaverBlog::slotLoadFinished);
return;
}
m_bBodyProcessed = true;
break;
}
case E_CRAWL_MODE::REPLY:
{
if (m_bReplyProcessed || m_bReplyProcessing)
return;
BoolController(m_bBodyProcessing);
m_efData.reply = getReply();
m_bReplyProcessed = true;
break;
}
}
emit signalDataOk();
}
*/
// Convenience overload: collects the article data starting at the page's main
// frame and returns it by value.
DataForm SNaverBlog::getBody()
{
    DataForm collected;
    QWebFrame* const rootFrame = m_pWebPage->mainFrame();
    getBody(rootFrame, collected);
    return collected;
}
// A parsed body counts as complete only when both the ARTICLE_PROFILE and the
// REPLY_URL slots are non-empty.
bool SNaverBlog::check(const DataForm &_data)
{
    if (_data.data[ARTICLE_PROFILE].isEmpty())
        return false;
    return !_data.data[REPLY_URL].isEmpty();
}
bool SNaverBlog::check(const QVector<DataForm> &_data)
{
return true;
}
QVector<DataForm> SNaverBlog::getReply()
{
QVector<DataForm> data;
QWebFrame* frame = m_pWebPage->mainFrame();
QWebElement group = Find(frame->documentElement(),"ul","id","commentList");
QWebElementCollection elements = group.findAll("li");
QString strParent,strDate,strNick,strComm,strUrl,strId;
QStringList strList = m_strUrl.split("/");
QString strCommUrl;
for (int i=0; i < strList.size() - 1; i++)
strUrl += strList.at(i) + "/";
{
int nStartIdPos = m_strUrl.indexOf("blogId=") + QString("blogId=").size();
int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos);
strUrl += m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos) + '/';
}
{
int nStartIdPos = m_strUrl.indexOf("logNo=") + QString("logNo=").size();
int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos);
strUrl += m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos);
}
int nCount=0;
foreach (QWebElement element, elements)
{
if (element.attribute("class") == "_countableComment ")
{
strNick = strParent = GetSafeUtf(Find(element,"a","class","nick pcol2").toPlainText());
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
strComm = Find(element,"dd","class","comm pcol2").toPlainText();
strCommUrl = Find(element,"a","class","nick pcol2").attribute("href");
if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0)
strId = strCommUrl.split("/").at(3).trimmed();
if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0)
strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed();
if(strCommUrl.left(1) == "/")
{
QStringList strList = strCommUrl.split("&");
foreach(QString str, strList)
{
if(str.left(3) == "id=")
{
strId = str.right(str.length() - 3);
}
}
}
strComm = GetSafeUtf(strComm);
if (strComm.isEmpty()== false)
{
strComm.replace("'","\\'");
strComm.replace("\"","\\\"");
strComm = strComm.trimmed();
int nStartIdPos = m_strUrl.indexOf("blogId=") + QString("blogId=").size();
int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos);
DataForm form;
form.data[ARTICLE_URL] = strUrl.toUtf8();
form.data[ARTICLE_ID] = strId.trimmed().toUtf8();
form.data[PLATFORM_ID] = m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos);
form.data[ARTICLE_NICKNAME] = strNick.toUtf8();
form.data[ARTICLE_DATA] = strComm.toUtf8();
form.data[ARTICLE_DATE] = strDate.toUtf8();
form.data[REPLY_URL] = m_strUrl.toUtf8();
form.data[ARTICLE_ORDER] = QString::number(nCount++);
data.append(form);
/*
query.bindValue(":URL", strUrl.toUtf8());
query.bindValue(":PLATFORMID",m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos));
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strComm.toUtf8());
query.bindValue(":DATE",strDate.toUtf8());
query.bindValue(":URLREPLY",m_strUrl.toUtf8());
query.bindValue(":ROWNUM",(nCount++));
if (query.exec()==false)
cout << "error : " << query.lastError().text().toStdString();
*/
}
}
if (element.attribute("class") == "reply _countableComment ")
{
strNick = Find(element,"a","class","nick pcol2").toPlainText();
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
QWebElement subElement = Find(element,"dd","class","comm pcol2");
QString subNick = Find(subElement,"a","class","nick pcol2").toPlainText();
strComm = subElement.toPlainText();
strCommUrl = Find(element,"a","class","nick pcol2").attribute("href");
if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0)
strId = strCommUrl.split("/").at(3).trimmed();
if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0)
strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed();
if(strCommUrl.left(1) == "/")
{
QStringList strList = strCommUrl.split("&");
foreach(QString str, strList)
{
if(str.left(3) == "id=")
{
strId = str.right(str.length() - 3);
}
}
}
if(subNick.isEmpty() == false)
strComm = strComm.right(strComm.size()-subNick.size()-1);
if (strComm.isEmpty() == false)
{
strComm = GetSafeUtf(strComm);
strComm.replace("'","\\'");
strComm.replace("\"","\\\"");
strComm = strComm.trimmed();
int nStartIdPos = m_strUrl.indexOf("blogId=") + QString("blogId=").size();
int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos);
DataForm form;
form.data[ARTICLE_ID] = strId;
form.data[ARTICLE_URL] = strUrl.toUtf8();
form.data[PLATFORM_ID] = m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos);
form.data[ARTICLE_NICKNAME] = strNick.toUtf8();
form.data[ARTICLE_DATA] = strComm.toUtf8();
form.data[ARTICLE_DATE] = strDate.toUtf8();
form.data[ARTICLE_PARENT] = strParent.toUtf8();
form.data[REPLY_URL] = m_strUrl.toUtf8();
form.data[ARTICLE_ORDER] = QString::number(nCount++);
data.append(form);
/*
query.bindValue(":URL",strUrl.toUtf8());
query.bindValue(":PLATFORMID",m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos));
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strComm.toUtf8());
query.bindValue(":DATE",strDate.toUtf8());
query.bindValue(":PARENT",strParent.toUtf8());
query.bindValue(":URLREPLY",m_strUrl.toUtf8());
query.bindValue(":ROWNUM",(nCount++));
if (query.exec()==false)
{
cout << "error : " << query.lastError().text().toStdString();
}
*/
}
}
}
return data;
}
// Collects the article data from `frame` and, recursively, from all of its
// child frames into `_data`.
//
//  - Frame "BuddyConnectIframe": if the profile_name block has text, the
//    article URL (m_strUrl) is stored in ARTICLE_URL.
//  - Frame "mainFrame": fills blog title, nickname, blog id, title, date,
//    body text, profile text / image URL, the "like" text and the comment count.
//
// Note that the "like" text is stored in the REPLY_URL slot and the comment
// count in ARTICLE_ORDER; check() relies on REPLY_URL being non-empty.
//
// List lookups use QStringList::value() so that a missing meta tag or an
// unexpectedly short URL yields an empty string instead of an out-of-range
// access.
void SNaverBlog::getBody(QWebFrame *frame, DataForm &_data)
{
    if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0)
    {
        const QWebElement profile = Find(frame->documentElement(),"div","class","profile_name");
        // split() always returns at least one entry, so at(0) is safe here.
        const QString str = profile.toPlainText().split("\n").at(0);
        if (!str.isEmpty())
            _data.data[ARTICLE_URL] = m_strUrl;
    }
    if (frame->frameName().compare(QString("mainFrame")) == 0)
    {
        QString sympathy;
        QString numofReply;
        QString strProfile;

        // Blog title: second '|'-separated part of the og:article:author meta
        // tag, falling back to the blogTitleName span when that is missing.
        QWebElement proTitle = Find(frame->documentElement(),"meta","property","og:article:author");
        _data.data[PLATFORM_TITLE] = proTitle.attribute("content").split("|").value(1).trimmed();
        if(_data.data[PLATFORM_TITLE].length() > 0)
            _data.data[PLATFORM_TITLE] = GetSafeUtf(_data.data[PLATFORM_TITLE]);
        else
        {
            proTitle = Find(frame->documentElement(),"span","id","blogTitleName");
            _data.data[PLATFORM_TITLE] = GetSafeUtf(proTitle.toPlainText().trimmed());
        }

        QWebElement image;
        const QWebElement profile = Find(frame->documentElement(),"div","id","blog-profile");
        {
            // Nickname: profile block first, then the inline script variable.
            const QWebElement nick = Find(profile,"strong","id","nickNameArea");
            if (!nick.toPlainText().isEmpty())
                _data.data[ARTICLE_NICKNAME] = nick.toPlainText();
            if(_data.data[ARTICLE_NICKNAME].isEmpty())
            {
                const QString strHtml = frame->toHtml();
                const QString strFind = "var nickName = '";
                const int start = strHtml.indexOf(strFind);
                // Only read the value when the marker exists and a closing
                // quote follows a non-empty value.
                if (start != -1)
                {
                    const int nValuePos = start + strFind.length();
                    const int end = strHtml.indexOf("'", nValuePos);
                    if (end > nValuePos)
                        _data.data[ARTICLE_NICKNAME] = strHtml.mid(nValuePos, end - nValuePos);
                }
            }
            _data.data[ARTICLE_NICKNAME] = GetSafeUtf(_data.data[ARTICLE_NICKNAME]);

            // Blog id: taken from the article URL, either
            // http://blog.naver.com/<id>/... or http://<id>.blog.me/...
            const QStringList urlParts = m_strUrl.split("/");
            if (urlParts.value(3).trimmed() == _data.data[ARTICLE_NICKNAME].trimmed())
            {
                _data.data[ARTICLE_ID] = _data.data[ARTICLE_NICKNAME];
            }
            else if (_data.data[ARTICLE_ID].isEmpty())
            {
                if (urlParts.value(2).compare("blog.naver.com") == 0)
                    _data.data[ARTICLE_ID] = urlParts.value(3);
                else
                    _data.data[ARTICLE_ID] = urlParts.value(2).split(".").at(0);
            }
            if(_data.data[ARTICLE_NICKNAME].length() == 0)
                _data.data[ARTICLE_NICKNAME] = _data.data[ARTICLE_ID];

            // NOTE(review): the alt text below looks like a mis-encoded
            // (non-UTF-8) literal; it is kept exactly as found - confirm the
            // intended text against the original file encoding.
            image = Find(profile,"img","alt","<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD> <20>̹<EFBFBD><CCB9><EFBFBD>");
            strProfile = Find(profile, "p", "class", "caption align").toPlainText().trimmed();
        }
        {
            const QWebElement post = Find(frame->documentElement(),"div","id","postListBody");
            {
                // Comment count: digits of the comment-list link text.
                const QWebElement weCmt = post.findFirst("a[class^='pcol2 _cmtList']");
                if (!weCmt.isNull())
                {
                    numofReply = weCmt.toPlainText().replace(",", "").trimmed();
                    numofReply = numofReply.replace(QRegExp("[\\D]"), "");
                }
            }
            const QWebElement post_top = Find(post,"table","class","post-top");
            {
                // Title: classic layout first, then the two newer editor layouts.
                QWebElement title = Find(post_top,"span","class","pcol1 itemSubjectBoldfont");
                if(title.isNull())
                    title = Find(frame->documentElement(), "div", "class", "se_textView");
                if(title.isNull())
                    title = Find(frame->documentElement(), "h3", "class", "se_textarea");
                if (!title.toPlainText().isEmpty())
                    _data.data[ARTICLE_TITLE] = GetSafeUtf(title.toPlainText());
            }
            {
                // Date: '/' becomes '-', and ":00" is appended when a date was found.
                QWebElement date = Find(post_top,"p","class","date fil5 pcol2 _postAddDate");
                if(date.isNull())
                    date = Find(frame->documentElement(), "span","class","se_publishDate pcol2 fil5");
                _data.data[ARTICLE_DATE] = date.toPlainText().trimmed().replace("/","-");
                if (!_data.data[ARTICLE_DATE].isEmpty())
                    _data.data[ARTICLE_DATE] += ":00";
            }
            {
                // Body text, trying progressively looser selectors.
                QWebElement body = post.findFirst("div[class^='post-view pcol2 _param(1)']");
                if(body.isNull())
                    body = post.findFirst("div[class*='pcol2 _param(1)']");
                // The tag argument was missing here, which shifted "class" into
                // the selector position so this fallback could never match.
                if(body.isNull())
                    body = Find(post, "div", "class", "se_component_wrap sect_dsc __se_component_area");
                if (!body.toPlainText().isEmpty())
                    _data.data[ARTICLE_DATA] = GetSafeUtf(body.toPlainText());
            }
            {
                // "Like" button text; "0" when the button is absent.
                const QWebElement WEsympathy = Find(frame->documentElement(),"div","class","btn_like pcol2");
                if(WEsympathy.isNull())
                    sympathy = "0";
                else
                    sympathy = WEsympathy.toPlainText().trimmed();
            }
        }
        if(image.attribute("src").trimmed().length() != 0)
        {
            _data.data[ARTICLE_PROFILEURL] = image.attribute("src").trimmed();
        }
        strProfile = GetSafeUtf(strProfile);
        if(strProfile.length() > 0)
        {
            _data.data[ARTICLE_PROFILE] = strProfile.trimmed();
        }
        _data.data[REPLY_URL] = sympathy;
        _data.data[ARTICLE_ORDER] = numofReply;
    }
    foreach(QWebFrame *childFrame, frame->childFrames())
        getBody(childFrame, _data);
}
// Builds the comment-list URL for an article address.
//
// Two address shapes are understood:
//   http://blog.naver.com/<blogId>/<logNo>
//   http://<blogId>.blog.me/<logNo>
//
// Returns the CommentList.nhn URL on success. When the blog id or the post
// number cannot be found, signalError(OUT_DATED_CRAWLER, ...) is emitted and
// an empty string is returned.
//
// QStringList::at() does not throw on a bad index, so the former
// try/catch(...) could never report a malformed address; the parts are now
// fetched with value() and validated explicitly.
QString SNaverBlog::makeReplyUrl(const QString& _url)
{
    const QStringList strList = _url.split("/");
    QString strBlogId;
    QString strLogNo;
    if (strList.value(2).compare("blog.naver.com") == 0)
    {
        strBlogId = strList.value(3);
        strLogNo = strList.value(4);
    }
    else //id.blog.me
    {
        strBlogId = strList.value(2).split(".").at(0);
        strLogNo = strList.value(3);
    }

    if (strBlogId.isEmpty() || strLogNo.isEmpty())
    {
        emit signalError(E_ERROR_CODE::OUT_DATED_CRAWLER, "Check ReplyUrl\n" + _url);
        return QString();
    }

    // Both address shapes map to the same endpoint. The blog.me branch used to
    // produce "http://blog.naver.com//CommentList.nhn" with a doubled slash.
    QString strOut = "http://blog.naver.com";
    strOut += "/CommentList.nhn?blogId=";
    strOut += strBlogId;
    strOut += "&logNo=";
    strOut += strLogNo;
    strOut += "&currentPage=&isMemolog=false&focusingCommentNo=&showLastPage=true&shortestContentAreaWidth=false";
    return strOut;
}