Files
clients/EffectProcess/snaverblog.cpp

861 lines
25 KiB
C++

#include "snaverblog.h"
#include <QUrl>
#include <QTimer>
#include <QChar>
#include <functional>
#include <QWebEngineSettings>
#include <QFile>
#include <QStringList>
#include <QString>
namespace
{
// Delay handed to QTimer::singleShot before a page script is re-run.
// QTimer::singleShot takes its interval in milliseconds.
const int TIME = 3000; // 3 seconds
}
using std::placeholders::_1;
// JavaScript run against the loaded post page: returns the innerText of the
// publish-date element inside the 'mainFrame' frame. Two selectors are tried in
// order. When neither matches, '' is returned (instead of dereferencing null and
// throwing), which mirrors the sentinel style of the sibling scripts and lets
// cbBodyArticleDate substitute its default date.
const QString SNaverBlog::scriptArticleDate =
"function findArticledate() {"
"var tt = frames['mainFrame'].document.querySelector('p.date.fil5.pcol2._postAddDate'); "
"if(!tt) "
" tt = frames['mainFrame'].document.querySelector('span.se_publishDate.pcol2.fil5'); "
"if(!tt) "
" return ''; "
"return tt.innerText;"
"} "
"findArticledate();";
// JavaScript run against the loaded post page: returns the textContent of the
// 'span.pcol1.itemSubjectBoldfont' element inside the 'mainFrame' frame, or an
// empty string when that element is not found.
const QString SNaverBlog::scriptArticleTitle =
"function findArticletitle() "
"{ var tt = frames['mainFrame'].document.querySelector('span.pcol1.itemSubjectBoldfont');"
"if(tt)"
" return tt.textContent; "
"else "
" return \"\"; }; "
"findArticletitle();";
// JavaScript run against the loaded post page: returns the innerText of the post
// body inside the 'mainFrame' frame. 'div.post-view.pcol2' is tried first, then
// 'div.se_component_wrap.sect_dsc.__se_component_area'; '' is returned when
// neither element exists.
const QString SNaverBlog::scriptArticleData =
"function findArticledata()"
"{"
" var tt = frames['mainFrame'].document.querySelector('div.post-view.pcol2');"
" if (tt)"
" return tt.innerText;"
" else"
" {"
" var aa = frames['mainFrame'].document.querySelector("
"'div.se_component_wrap.sect_dsc.__se_component_area');"
" if (aa)"
" return aa.innerText;"
" else"
" return '';"
" }"
"}"
"findArticledata();";
// JavaScript run against the loaded post page: returns the textContent of the
// first 'em.u_cnt._cnt' element inside the 'mainFrame' frame, or the sentinel
// string "-1" when the element is not present (the C++ callbacks treat "-1" as
// "not available yet" and retry).
const QString SNaverBlog::scriptLikeCount =
"function findSympathy() { "
"var tt = frames['mainFrame'].document.querySelector('em.u_cnt._cnt'); "
"if (tt)"
" return tt.textContent;"
"else"
" return \"-1\"; "
"}"
"findSympathy();";
// JavaScript run against the loaded post page: returns the textContent of the
// first 'em.u_cnt._cnt' element inside the 'mainFrame' frame, or "-1" when it is
// not present.
// NOTE(review): this script is character-for-character identical to
// scriptLikeCount (same selector, same findSympathy function); despite its name
// it does not extract a URL. Confirm whether a distinct reply-URL selector was
// intended.
const QString SNaverBlog::scriptReplyUrl =
"function findSympathy() { "
"var tt = frames['mainFrame'].document.querySelector('em.u_cnt._cnt'); "
"if (tt)"
" return tt.textContent;"
"else"
" return \"-1\"; "
"}"
"findSympathy();";
// JavaScript run against the loaded post page: returns the 'content' attribute of
// the <meta property='naverblog:nickname'> element inside the 'mainFrame' frame,
// or an empty string when that meta element is missing.
const QString SNaverBlog::scriptArticleNickname =
"function findNickname() "
"{"
"var tt = frames['mainFrame'].document.querySelector(\"meta[property='naverblog:nickname']\");"
"if (tt)"
" return tt.getAttribute('content');"
"else"
" return \"\";"
"}"
"findNickname();";
// JavaScript run against the loaded post page: returns the textContent of the
// 'a.pcol2._cmtList' element inside the 'mainFrame' frame, or "0" when it is not
// found. The C++ callback strips every non-digit from the result.
// NOTE(review): the JS function is named findReplyNum, so this presumably yields
// the comment count rather than an ordering value — confirm.
const QString SNaverBlog::scriptArticleOrder =
"function findReplyNum()"
"{"
"var tt = frames['mainFrame'].document.querySelector('a.pcol2._cmtList');"
"if (tt)"
" return tt.textContent;"
"else"
" return \"0\";"
"}"
"findReplyNum();";
// JavaScript run against the comment-list page. Walks every <li> below
// 'ul#commentList' and returns one 6-element array per comment:
//   [blogId, nickname, date, commentText, rowIndex, parentNickname]
// Only <li> elements whose class attribute is exactly '_countableComment '
// (top-level comment) or 'reply _countableComment ' (reply) are reported.
//  * blogId is parsed from the href of the first 'a.nick.pcol2' link
//    (http://blog.naver.com/<id>, <id>.blog.me, or a relative '/...&id=<id>' link).
//  * date falls back to '1990-01-01 00:00' when no date element exists.
//  * parentNickname is '-1' for top-level comments; for replies it is the
//    nickname of the most recent top-level comment.
//  * For replies, the leading mention (an 'a.nick.pcol2' inside the comment
//    body) is cut off the front of the comment text.
//
// Fixes relative to the previous version:
//  * The reply branch used to assign the reply's own nickname to strParent, so
//    every reply reported itself as its parent. strParent is now only updated
//    by top-level comments.
//  * A missing 'ul#commentList' or a nick link without an href no longer throws;
//    an empty result / empty id is returned instead.
//  * The two duplicated extraction branches are shared through local helpers.
// The script is concatenated without newlines, so it must not contain '//'
// JavaScript comments.
const QString SNaverBlog::scriptReply =
"function getReply()"
"{"
"  function textOf(parent, selector, fallback)"
"  {"
"    var el = parent.querySelector(selector);"
"    return el ? el.textContent : fallback;"
"  }"
"  function blogIdOf(li)"
"  {"
"    var strId = '';"
"    var anchor = li.querySelector('a.nick.pcol2');"
"    var strHref = anchor ? anchor.getAttribute('href') : null;"
"    if (!strHref)"
"      return strId;"
"    if (strHref.substr(0, 21) == 'http://blog.naver.com')"
"    {"
"      strId = strHref.split('/')[3] || '';"
"    }"
"    else if (strHref.substr(strHref.length - 7, 7) == 'blog.me')"
"    {"
"      strId = (strHref.split('/')[2] || '').split('.')[0];"
"    }"
"    else if (strHref.substr(0, 1) == '/')"
"    {"
"      var strList = strHref.split('&');"
"      for (var j = 0; j < strList.length; ++j)"
"      {"
"        if (strList[j].substr(0, 3) == 'id=')"
"          strId = strList[j].substr(3, strList[j].length - 3);"
"      }"
"    }"
"    return strId;"
"  }"
"  var result = [];"
"  var ul = document.querySelector('ul#commentList');"
"  if (!ul)"
"    return result;"
"  var lis = ul.querySelectorAll('li');"
"  var rowCount = 0;"
"  var strParent = '';"
"  for (var i = 0; i < lis.length; ++i)"
"  {"
"    var att = lis[i].getAttribute('class');"
"    var isTop = (att == '_countableComment ');"
"    var isReply = (att == 'reply _countableComment ');"
"    if (!isTop && !isReply)"
"      continue;"
"    var strNickname = textOf(lis[i], 'a.nick.pcol2', '');"
"    var strDate = textOf(lis[i], 'span.date.fil5.pcol2', '1990-01-01 00:00');"
"    var strId = blogIdOf(lis[i]);"
"    var eleComment = lis[i].querySelector('dd.comm.pcol2');"
"    var strComm = eleComment ? eleComment.innerText : '';"
"    if (isTop)"
"    {"
"      strParent = strNickname;"
"    }"
"    else if (eleComment)"
"    {"
"      var subNick = eleComment.querySelector('a.nick.pcol2');"
"      if (subNick)"
"        strComm = strComm.substr(subNick.textContent.length);"
"    }"
"    result.push([strId, strNickname, strDate, strComm, rowCount.toString(), isTop ? '-1' : strParent]);"
"    rowCount = rowCount + 1;"
"  }"
"  return result;"
"}"
"getReply();";
// JavaScript run against the like-history page. For every <tr> below
// '#comment>table>tbody' it reads the text of 'th>span', keeps the part before
// the first space as the date, and counts how many rows share that date.
// Returns an array of [date, count] pairs in Map insertion order.
// Note: neither the table body nor the per-row span is null-checked, so the
// script throws if either is missing.
const QString SNaverBlog::scriptLike =
"function getLike()"
"{"
" var result = [];"
" var datemap = new Map();"
" var likeTableBody = document.querySelector('#comment>table>tbody');"
" var likeTrs = likeTableBody.querySelectorAll('tr');"
" for (var i=0; i<likeTrs.length; i++)"
" {"
" var span = likeTrs[i].querySelector('th>span');"
" var rawdate = span.textContent;"
" var date = rawdate.split(' ')[0];"
""
" if (datemap.get(date) == undefined)"
" {"
" datemap.set(date, 1);"
" }"
" else"
" {"
" datemap.set(date, datemap.get(date)+1);"
" }"
" }"
" datemap.forEach(function (item, key) {"
" var datecount = [key, item];"
" result.push(datecount);"
" });"
""
" return result;"
"}"
"getLike();";
/// Builds a filtered copy of @p _strData.
///
/// A UTF-16 code unit is kept when it is
///  - in 12593..12622 (U+3131..U+314E, Hangul compatibility consonants),
///  - in 44032..55203 (U+AC00..U+D7A3, Hangul syllables), or
///  - a digit/number, whitespace, lower-case letter, upper-case letter or symbol
///    according to QChar's classification.
/// Everything else (e.g. punctuation) is dropped. The three tests are applied
/// independently, exactly as before.
QString GetSafeUtf(const QString& _strData)
{
    QString filtered;
    for (const QChar ch : _strData)
    {
        const auto code = ch.unicode();

        const bool isCompatJamo = (code >= 12593) && (code <= 12622);
        if (isCompatJamo)
            filtered += ch;

        const bool isHangulSyllable = (code >= 44032) && (code <= 55203);
        if (isHangulSyllable)
            filtered += ch;

        const bool isPlainClass = ch.isDigit() || ch.isNumber() || ch.isSpace()
                               || ch.isLower() || ch.isUpper() || ch.isSymbol();
        if (isPlainClass)
            filtered += ch;
    }
    return filtered;
}
/// Scope guard for a boolean flag: sets the referenced flag to true on
/// construction and back to false on destruction.
///
/// The guard is non-copyable. A copy would hold the same reference and its
/// destructor would clear the flag while the original guard is still alive.
class BoolController
{
public:
    /// @param _b flag to raise for the lifetime of this guard; must outlive it.
    explicit BoolController(bool& _b) : m_b(_b)
    {
        m_b = true;
    }

    ~BoolController()
    {
        m_b = false;
    }

    BoolController(const BoolController&) = delete;
    BoolController& operator=(const BoolController&) = delete;

private:
    bool& m_b;
};
// Unlocks the given mutex. Used as the custom deleter of the std::shared_ptr
// held by Lock, so it "releases" the mutex instead of deleting it.
// _pm is dereferenced unconditionally and must not be null.
void unlock(QMutex* _pm)
{
_pm->unlock();
}
/// Scoped mutex guard: locks the mutex on construction and unlocks it when the
/// last copy of the guard is destroyed (the std::shared_ptr's deleter is
/// unlock(), not delete).
///
/// The mutex is locked *before* it is handed to the shared_ptr. If the
/// shared_ptr fails to allocate its control block it invokes the deleter
/// immediately; with the previous ordering that unlocked a mutex which had
/// never been locked. With this ordering the same failure path correctly
/// releases the lock that was just taken.
class Lock
{
public:
    /// @param _pm mutex to hold; must be non-null and outlive the guard.
    explicit Lock(QMutex* _pm)
    {
        _pm->lock();
        m_pMutex.reset(_pm, unlock);
    }

private:
    std::shared_ptr<QMutex> m_pMutex;
};
/// Creates a crawler bound to @p _page, or to a freshly created page when
/// @p _page is null, clears all per-field completion flags and wires the page's
/// signals.
///
/// The fallback page is created exactly once and owned solely by m_pWebPage.
/// The previous code, make_shared<SWebPage>(new SWebPage(this)), allocated two
/// pages: a raw-new'd one that was used only as the QObject parent of the
/// shared one. No QObject parent is passed here, so Qt's parent/child deletion
/// cannot compete with the shared_ptr for ownership of the page.
SNaverBlog::SNaverBlog(std::shared_ptr<SWebPage> _page):m_pWebPage(_page),
m_eMode(E_CRAWL_MODE::BODY), m_bReplyProcessed(false), m_bBodyProcessed(false),
m_pNetworkRequest(new QNetworkRequest), m_nBodyRetry(3)
{
    if (!_page)
        m_pWebPage = std::make_shared<SWebPage>(static_cast<QObject*>(nullptr));

    // No body field has been extracted yet.
    for (int i = 0; i < E_FUNC_MAX; ++i)
        m_abOk[i] = false;

    initConnect();
}
void SNaverBlog::initConnect()
{
QObject::connect(&(*m_pWebPage), &SWebPage::loadFinished,
this, &SNaverBlog::slotLoadFinished);
QObject::connect(&(*m_pWebPage), &SWebPage::signalAlert,
this, &SNaverBlog::slotAlert);
/*
QObject::connect(&(*m_pWebPage), &SWebPage::loadProgress,
[](int n){ qDebug() << n; });
QObject::connect(&(*m_pWebPage), &SWebPage::loadStarted,
[](){ qDebug() << "loadstart"; });
*/
m_pWebPage->settings()->setAttribute(QWebEngineSettings::AutoLoadImages, false);
}
/// Rewrites an "<id>.blog.me" post URL to its blog.naver.com form.
///
/// The URL is split on '/'. When it has more than three pieces and the host
/// piece (index 2) contains "blog.me", the result is
/// "http://blog.naver.com/<first host label>/<piece 3>". Any other URL is
/// returned unchanged.
QString SNaverBlog::changeUrl(const QString& _url)
{
    const QStringList pieces = _url.split("/");
    const bool isBlogMe = (pieces.size() > 3) && pieces.at(2).contains("blog.me");
    if (!isBlogMe)
        return _url;

    QString rewritten("http://blog.naver.com/");
    rewritten.append(pieces.at(2).split(".").at(0));
    rewritten.append("/");
    rewritten.append(pieces.at(3));
    return rewritten;
}
// Slot connected to SWebPage::signalAlert. Any alert raised by the page is
// reported as E_ERROR_CODE::DELETED_URL_ERROR, with the alert text (UTF-8
// encoded) as the error message.
void SNaverBlog::slotAlert(const QString& msg)
{
emit signalError(E_ERROR_CODE::DELETED_URL_ERROR, msg.toUtf8());
}
void SNaverBlog::go(const QString &_url, E_CRAWL_MODE _mode)
{
//qDebug() << _url;
m_eMode = _mode;
QUrl url(changeUrl(_url));
if (_mode == E_CRAWL_MODE::BODY)
m_strUrl = _url.trimmed();
/*
if (url.scheme().isEmpty())
url.setScheme("http");
QNetworkRequest requests;
requests.setUrl(url);
requests.setRawHeader("Accept-Language",
"ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
*/
//m_pWebPage->load(requests);
m_pWebPage->load(url);
/*
m_pNetworkRequest->setUrl(url);
m_pNetworkRequest->setRawHeader(
"Accept-Language",
"ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2"
);
m_pWebPage->mainFrame()->load(*m_pNetworkRequest);
*/
}
void SNaverBlog::slotLoadFinished(bool ok)
{
qDebug() << "slotLoadFinished";
if (!ok)
{
signalError(E_ERROR_CODE::UNKNOWN_ERROR, "UnkownError/BlockError/ProxyError");
return;
}
qDebug() << "slotLoadFinished";
switch(m_eMode)
{
case E_CRAWL_MODE::BODY:
{
if (m_bBodyProcessed)
return;
qDebug() << "body loadfinished";
m_pWebPage->runJavaScript(scriptArticleDate, std::bind(&SNaverBlog::cbBodyArticleDate,this, _1));
m_pWebPage->runJavaScript(scriptArticleData, std::bind(&SNaverBlog::cbBodyArticleData,this, _1));
m_pWebPage->runJavaScript(scriptArticleNickname, std::bind(&SNaverBlog::cbBodyArticleNickname, this, _1));
m_pWebPage->runJavaScript(scriptArticleOrder, std::bind(&SNaverBlog::cbBodyArticleOrder,this, _1));
m_pWebPage->runJavaScript(scriptArticleTitle, std::bind(&SNaverBlog::cbBodyArticleTitle,this, _1));
//m_pWebPage->runJavaScript(scriptReplyUrl, std::bind(&SNaverBlog::cbBodyReplyUrl,this, _1));
m_pWebPage->runJavaScript(scriptLikeCount, std::bind(&SNaverBlog::cbBodyReplyUrl,this, _1));
return;
}
case E_CRAWL_MODE::REPLY:
{
if (m_bReplyProcessed)
return;
qDebug() << "reply loadfinished";
m_pWebPage->runJavaScript(scriptReply, std::bind(&SNaverBlog::cbReply, this, _1));
/*
m_pWebPage->toHtml([](const QString &str){
QFile file("reply.html");
if (!file.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append))
return;
QTextStream out(&file);
out << str << "\n";
file.close();
});
*/
break;
}
case E_CRAWL_MODE::LIKE:
{
m_pWebPage->runJavaScript(scriptLike, std::bind(&SNaverBlog::cbLike,this, _1));
break;
}
}
//emit signalDataOk(m_efData);
}
/*
void SNaverBlog::slotLoadFinished(bool ok)
{
if (!ok)
{
signalError(E_ERROR_CODE::UNKNOWN_ERROR, "UnkownError/BlockError/ProxyError");
return;
}
switch(m_eMode)
{
case E_CRAWL_MODE::BODY:
{
if (m_bBodyProcessing || m_bBodyProcessed)
return;
BoolController(m_bBodyProcessing);
m_efData.body = getBody();
if (!check(m_efData.body) && m_nBodyRetry-- > 0)
{
QTimer::singleShot(TIME, this, &SNaverBlog::slotLoadFinished);
return;
}
m_bBodyProcessed = true;
break;
}
case E_CRAWL_MODE::REPLY:
{
if (m_bReplyProcessed || m_bReplyProcessing)
return;
BoolController(m_bBodyProcessing);
m_efData.reply = getReply();
m_bReplyProcessed = true;
break;
}
}
emit signalDataOk();
}
*/
/*
DataForm SNaverBlog::getBody()
{
DataForm data;
getBody(m_pWebPage->mainFrame(), data);
return data;
}
*/
/// Sanity check for an extracted post body.
///
/// @return false only when the nickname is empty AND the date is either empty
///         or still the placeholder "1990-01-01 00:00:00"; true otherwise.
bool SNaverBlog::check(const DataForm &_data)
{
    const bool nicknameMissing = _data.data[ARTICLE_NICKNAME].isEmpty();
    const bool dateMissing = _data.data[ARTICLE_DATE].isEmpty()
                          || _data.data[ARTICLE_DATE] == "1990-01-01 00:00:00";
    return !(nicknameMissing && dateMissing);
}
// Overload for a list of reply entries. Currently performs no validation:
// _data is not inspected and the result is always true.
bool SNaverBlog::check(const QVector<DataForm> &_data)
{
return true;
}
/// Builds the comment-list URL for a post.
///
/// Two post URL shapes are understood (the URL is split on '/'):
///  - http://blog.naver.com/<blogId>/<logNo>
///  - http://<blogId>.blog.me/<logNo>
///
/// @return "http://blog.naver.com/CommentList.nhn?blogId=<blogId>&logNo=<logNo>&..."
///         or an empty string (after emitting OUT_DATED_CRAWLER) when the URL
///         matches neither shape.
///
/// Fixes: the blog.me branch no longer produces a double slash before
/// "CommentList.nhn", and the error message now carries the offending URL
/// (previously it appended the still-empty output string).
QString SNaverBlog::makeReplyUrl(const QString& _url)
{
    const QStringList strList = _url.split("/");
    QString strOut = "";
    QString strBlogId;
    QString strLogNo;

    if ((strList.size() > 4) && (strList.at(2).compare("blog.naver.com") == 0))
    {
        strBlogId = strList.at(3);
        strLogNo = strList.at(4);
    }
    else if ((strList.size() > 3) && strList.at(2).contains("blog.me")) //id.blog.me
    {
        strBlogId = strList.at(2).split(".").at(0);
        strLogNo = strList.at(3);
    }
    else
    {
        emit signalError(E_ERROR_CODE::OUT_DATED_CRAWLER, "Check Body and ReplyUrl\n" + _url);
        return strOut;
    }

    strOut = "http://blog.naver.com";
    strOut += "/CommentList.nhn?blogId=";
    strOut += strBlogId;
    strOut += "&logNo=";
    strOut += strLogNo;
    strOut += "&currentPage=&isMemolog=false&focusingCommentNo=&showLastPage=true&shortestContentAreaWidth=false";
    return strOut;
}
/// Builds the like-history ("sympathy") URL for a post.
///
/// url example : http://blog.naver.com/SympathyHistoryList.nhn?blogId=yewonerang&logNo=220984900374
///
/// The post URL is split on '/'. "<blogId>.blog.me/<logNo>" URLs are parsed the
/// same way makeReplyUrl parses them; any other URL must have at least five
/// pieces, with the blog id at index 3 and the log number at index 4.
///
/// @return the like-history URL, or an empty string (after emitting
///         OUT_DATED_CRAWLER) when the URL is too short to contain both parts.
///
/// Fixes: the previous version emitted the error but then still read indices
/// 3 and 4 (and only guarded against fewer than four pieces), which is an
/// out-of-range QList::at() for short URLs and for blog.me URLs.
QString SNaverBlog::makeLikeUrl(const QString& _url)
{
    const QStringList strList = _url.split("/");
    QString strOut = "";
    QString strBlogId;
    QString strLogNo;

    if ((strList.size() > 3) && strList.at(2).contains("blog.me")) //id.blog.me
    {
        strBlogId = strList.at(2).split(".").at(0);
        strLogNo = strList.at(3);
    }
    else if (strList.size() > 4)
    {
        strBlogId = strList.at(3);
        strLogNo = strList.at(4);
    }
    else
    {
        emit signalError(E_ERROR_CODE::OUT_DATED_CRAWLER, "Check Body and LikeUrl\n");
        return strOut;
    }

    strOut += "http://blog.naver.com/SympathyHistoryList.nhn?blogId=";
    strOut += strBlogId;
    strOut += "&logNo=";
    strOut += strLogNo;
    return strOut;
}
/// Callback for scriptArticleDate.
///
/// Normalises the scraped date text ('/' and '.' become '-', "- " and 'T'
/// become a space), appends ":00" for the seconds, and stores it as the body's
/// ARTICLE_DATE. An empty result is replaced by "1990-01-01 00:00:00". When this
/// was the last outstanding body field, the crawl proceeds to the reply page.
void SNaverBlog::cbBodyArticleDate(const QVariant& _result)
{
    QString strDate = _result.toString().trimmed();
    strDate.replace("/", "-").replace(".", "-").replace("- ", " ").replace("T", " ");

    if (strDate.isEmpty())
        strDate = "1990-01-01 00:00:00";
    else
        strDate += ":00";

    m_efData.body.data[ARTICLE_DATE] = strDate;

    if (doneBodyCrawler(E_FUNC_ARTICLE_DATE))
        goReplyUrl();
}
void SNaverBlog::cbBodyArticleData(const QVariant& _result)
{
//qDebug() << "articledata";
bool ok;
//static int Retry = 2;
QString strData = _result.toString();
strData = GetSafeUtf(strData.trimmed());
m_efData.body.data[ARTICLE_DATA] = strData;
ok = doneBodyCrawler(E_FUNC_ARTICLE_DATA);
//qDebug() << "done articledata";
if (ok)
goReplyUrl();
}
void SNaverBlog::cbBodyArticleTitle(const QVariant& _result)
{
//qDebug() << "articletitle";
bool ok;
//static int Retry = 2;
QString strData = _result.toString();
strData = GetSafeUtf(strData.trimmed());
m_efData.body.data[ARTICLE_TITLE] = strData;
ok = doneBodyCrawler(E_FUNC_ARTICLE_TITLE);
//qDebug() << "done articletitle";
if (ok)
goReplyUrl();
}
void SNaverBlog::cbBodyArticleNickname(const QVariant& _result)
{
//qDebug() << "articlenickname";
bool ok;
//static int Retry = 2;
QString strData = _result.toString();
strData = GetSafeUtf(strData.trimmed());
m_efData.body.data[ARTICLE_NICKNAME] = strData;
ok = doneBodyCrawler(E_FUNC_ARTICLE_NICKNAME);
//qDebug() << "done articlenickname";
if (ok)
goReplyUrl();
}
void SNaverBlog::cbBodyArticleOrder(const QVariant& _result)
{
//qDebug() << "articleorder";
bool ok;
//static int Retry = 2;
QString strData = _result.toString();
strData = strData.trimmed().replace(",","");
strData = strData.replace(QRegExp("[\\D]+"), "");
m_efData.body.data[ARTICLE_ORDER] = strData;
ok = doneBodyCrawler(E_FUNC_ARTICLE_ORDER);
//qDebug() << "done articleorder";
if (ok)
goReplyUrl();
}
void SNaverBlog::cbBodyReplyUrl(const QVariant& _result)
{
//qDebug() << "replyurl";
bool ok;
static int Retry = 2;
QString strData = _result.toString().trimmed().replace(",", "");
if ((strData == "-1" || strData.isEmpty()) && Retry-- > 0)
{
QTimer::singleShot(TIME, this, [this](){
this->m_pWebPage->runJavaScript(scriptReplyUrl, std::bind(&SNaverBlog::cbBodyReplyUrl, this, _1));
});
return;
}
m_efData.body.data[REPLY_URL] = strData;
ok = doneBodyCrawler(E_FUNC_REPLY_URL);
//qDebug() << "done replyurl";
if (ok)
goReplyUrl();
}
void SNaverBlog::cbBodyLikeCount(const QVariant& _result)
{
//qDebug() << "replyurl";
bool ok;
static int Retry = 2;
QString strData = _result.toString().trimmed().replace(",", "");
if ((strData == "-1" || strData.isEmpty()) && Retry-- > 0)
{
QTimer::singleShot(TIME, this, [this](){
this->m_pWebPage->runJavaScript(scriptLikeCount, std::bind(&SNaverBlog::cbBodyLikeCount, this, _1));
});
return;
}
m_efData.body.data[LIKE_COUNT] = strData;
ok = doneBodyCrawler(E_FUNC_REPLY_URL);
//qDebug() << "done replyurl";
if (ok)
goReplyUrl();
}
/// Marks the body-extraction step @p _func_type as finished.
///
/// @return true when every step in [0, E_FUNC_MAX) has been marked finished,
///         false while at least one is still outstanding.
/// The flag update and the scan run while m_mutexBody is held.
bool SNaverBlog::doneBodyCrawler(E_FUNC _func_type)
{
    Lock guard(&m_mutexBody);

    m_abOk[_func_type] = true;

    for (int step = 0; step < E_FUNC_MAX; ++step)
    {
        if (!m_abOk[step])
            return false;
    }
    return true;
}
/// Finishes the body phase and moves on to the reply phase.
///
/// The body is flagged as processed first. If the collected body passes
/// check(), the comment-list URL derived from m_strUrl is loaded in REPLY mode;
/// otherwise a BLOCK_ERROR is reported and nothing is loaded.
void SNaverBlog::goReplyUrl()
{
    m_bBodyProcessed = true;

    if (check(m_efData.body))
    {
        go(makeReplyUrl(m_strUrl), E_CRAWL_MODE::REPLY);
        return;
    }

    emit signalError(E_ERROR_CODE::BLOCK_ERROR, "Block or Check naver");
}
/// Moves on to the like-history phase.
///
/// The body is flagged as processed. If the collected body passes check(), the
/// like-history URL derived from m_strUrl is loaded in LIKE mode; otherwise a
/// BLOCK_ERROR is reported and nothing is loaded.
void SNaverBlog::goLikeUrl()
{
    m_bBodyProcessed = true;

    if (check(m_efData.body))
    {
        go(makeLikeUrl(m_strUrl), E_CRAWL_MODE::LIKE);
        return;
    }

    emit signalError(E_ERROR_CODE::BLOCK_ERROR, "Block or Check naver");
}
void SNaverBlog::cbReply(const QVariant& _result)
{
//id, nickname, date, commment, order, strParent;
if (_result.isValid() && !_result.isNull())
{
QList<QVariant> results = _result.toList();
foreach (auto &result, results)
{
QStringList slResult = result.toStringList();
if (slResult.size() == 6)
{
DataForm form;
form.data[ARTICLE_ID] = slResult.at(0).trimmed();
form.data[ARTICLE_NICKNAME] = GetSafeUtf(slResult.at(1).trimmed());
form.data[ARTICLE_DATE] = slResult.at(2).trimmed().replace(".", "-").replace("/", "-").replace("T", " ").
replace("- ", " ") + ":00";
form.data[ARTICLE_DATA] = GetSafeUtf(slResult.at(3).trimmed());
form.data[ARTICLE_ORDER] = slResult.at(4).trimmed();
form.data[ARTICLE_PARENT] = (slResult.at(5).trimmed() == "-1") ? "" : slResult.at(5).trimmed();
m_efData.reply.append(form);
}
}
}
/*
qDebug() << "cbReply";
qDebug() << m_efData.reply.size();
*/
// emit signalDataOk(m_efData);
goLikeUrl();
}
void SNaverBlog::cbLike(const QVariant& _result)
{
if (_result.isValid() && !_result.isNull())
{
QList<QVariant> results = _result.toList();
foreach (auto &result, results)
{
QStringList slResult = result.toStringList();
QString date = ((QString)slResult.at(0)).replace(".", "");
m_efData.like[date] = ((QString)slResult.at(1)).toInt();
}
}
emit signalDataOk(m_efData);
}