#include "snaverblog.h"

// NOTE(review): the names of the eight angle-bracket includes were not legible
// in the reviewed copy of this file; the list below is reconstructed from the
// identifiers this translation unit uses. Confirm against version control.
#include <QDebug>
#include <QMutex>
#include <QNetworkRequest>
#include <QRegExp>
#include <QTimer>
#include <QWebEngineSettings>
#include <functional>
#include <memory>

namespace
{
    // Delay, in milliseconds, before a page script is re-run (3 seconds).
    const int TIME = 3000;
}

using std::placeholders::_1;

// ---------------------------------------------------------------------------
// JavaScript snippets evaluated inside the loaded page.
//
// Each snippet defines one function and ends with a call to it, so the value
// handed to the runJavaScript() callback is that function's return value.
// The body snippets read the document of the 'mainFrame' frame.
// ---------------------------------------------------------------------------

// Posting date text; empty string when neither known date element exists.
const QString SNaverBlog::scriptArticleDate = R"js(
function findArticledate()
{
    var tt = frames['mainFrame'].document.querySelector('p.date.fil5.pcol2._postAddDate');
    if (!tt)
        tt = frames['mainFrame'].document.querySelector('span.se_publishDate.pcol2.fil5');
    return tt ? tt.innerText : '';
}
findArticledate();
)js";

// Post title text; empty string when the title element is missing.
const QString SNaverBlog::scriptArticleTitle = R"js(
function findArticletitle()
{
    var tt = frames['mainFrame'].document.querySelector('span.pcol1.itemSubjectBoldfont');
    if (tt)
        return tt.textContent;
    else
        return "";
}
findArticletitle();
)js";

// Post body text, trying two known container selectors in turn.
const QString SNaverBlog::scriptArticleData = R"js(
function findArticledata()
{
    var tt = frames['mainFrame'].document.querySelector('div.post-view.pcol2');
    if (tt)
        return tt.innerText;
    var aa = frames['mainFrame'].document.querySelector(
        'div.se_component_wrap.sect_dsc.__se_component_area');
    if (aa)
        return aa.innerText;
    return '';
}
findArticledata();
)js";

// Text of the 'em.u_cnt._cnt' counter element, or "-1" when it is not present.
const QString SNaverBlog::scriptLikeCount = R"js(
function findSympathy()
{
    var tt = frames['mainFrame'].document.querySelector('em.u_cnt._cnt');
    if (tt)
        return tt.textContent;
    else
        return "-1";
}
findSympathy();
)js";

// Same script text as scriptLikeCount; it is what cbBodyReplyUrl() re-runs
// when it retries.
const QString SNaverBlog::scriptReplyUrl = R"js(
function findSympathy()
{
    var tt = frames['mainFrame'].document.querySelector('em.u_cnt._cnt');
    if (tt)
        return tt.textContent;
    else
        return "-1";
}
findSympathy();
)js";

// Blog nickname from the 'naverblog:nickname' meta tag; empty when missing.
const QString SNaverBlog::scriptArticleNickname = R"js(
function findNickname()
{
    var tt = frames['mainFrame'].document.querySelector("meta[property='naverblog:nickname']");
    if (tt)
        return tt.getAttribute('content');
    else
        return "";
}
findNickname();
)js";

// Text of the comment-list link (cbBodyArticleOrder() keeps only its digits);
// "0" when the link is missing.
const QString SNaverBlog::scriptArticleOrder = R"js(
function findReplyNum()
{
    var tt = frames['mainFrame'].document.querySelector('a.pcol2._cmtList');
    if (tt)
        return tt.textContent;
    else
        return "0";
}
findReplyNum();
)js";

// Collects the comments of a comment-list page.
//
// Returns an array of 6-element rows:
//   [id, nickname, date, comment, order, parent]
// 'parent' is '-1' for a top-level comment. For a reply it is the nickname of
// the most recent top-level comment seen before it.
//
// Fixes relative to the previous version:
//  * the reply branch used to overwrite strParent with the reply's own
//    nickname, so every reply reported itself as its parent;
//  * a missing 'ul#commentList' or an anchor without href no longer throws
//    (a throw made the whole script yield no rows).
const QString SNaverBlog::scriptReply = R"js(
function getReply()
{
    var result = [];
    var ul = document.querySelector('ul#commentList');
    if (!ul)
        return result;
    var lis = ul.querySelectorAll('li');
    var rowCount = 0;
    var strParent = '';

    function textOf(root, selector, fallback)
    {
        var ele = root.querySelector(selector);
        return ele ? ele.textContent : fallback;
    }

    function idOf(li)
    {
        var anchor = li.querySelector('a.nick.pcol2');
        var href = anchor ? (anchor.getAttribute('href') || '') : '';
        var id = '';
        if (href.substr(0, 21) == 'http://blog.naver.com')
        {
            id = href.split('/')[3] || '';
        }
        else if (href.substr(href.length - 7, 7) == 'blog.me')
        {
            id = (href.split('/')[2] || '').split('.')[0];
        }
        else if (href.substr(0, 1) == '/')
        {
            var params = href.split('&');
            for (var j = 0; j < params.length; ++j)
            {
                if (params[j].substr(0, 3) == 'id=')
                    id = params[j].substr(3);
            }
        }
        return id;
    }

    for (var i = 0; i < lis.length; ++i)
    {
        var att = lis[i].getAttribute('class');
        var isReply;
        if (att == '_countableComment ')
            isReply = false;
        else if (att == 'reply _countableComment ')
            isReply = true;
        else
            continue;

        var strNickname = textOf(lis[i], 'a.nick.pcol2', '');
        var strDate = textOf(lis[i], 'span.date.fil5.pcol2', '1990-01-01 00:00');
        var strId = idOf(lis[i]);
        var eleComment = lis[i].querySelector('dd.comm.pcol2');
        var strComm = eleComment ? eleComment.innerText : '';
        var strParentOut = '-1';

        if (isReply)
        {
            var subNick = eleComment ? eleComment.querySelector('a.nick.pcol2') : null;
            if (subNick)
                strComm = strComm.substr(subNick.textContent.length);
            strParentOut = strParent;
        }
        else
        {
            strParent = strNickname;
        }

        result.push([strId, strNickname, strDate, strComm, rowCount.toString(), strParentOut]);
        rowCount = rowCount + 1;
    }
    return result;
}
getReply();
)js";

// Counts rows of the like-history table per date.
//
// Returns an array of [date, count] pairs, where 'date' is the part of the
// row's span text before the first space.
//
// NOTE(review): the loop header and the start of the per-row selector were
// garbled in the reviewed copy ("for (var i=0; ispan');"). The loop bound and
// the 'td>span' selector below are a reconstruction - confirm the selector
// against the live page / version control.
const QString SNaverBlog::scriptLike = R"js(
function getLike()
{
    var result = [];
    var datemap = new Map();
    var likeTableBody = document.querySelector('#comment>table>tbody');
    if (!likeTableBody)
        return result;
    var likeTrs = likeTableBody.querySelectorAll('tr');
    for (var i = 0; i < likeTrs.length; ++i)
    {
        var span = likeTrs[i].querySelector('td>span');
        if (!span)
            continue;
        var rawdate = span.textContent;
        var date = rawdate.split(' ')[0];

        if (datemap.get(date) == undefined)
        {
            datemap.set(date, 1);
        }
        else
        {
            datemap.set(date, datemap.get(date) + 1);
        }
    }
    datemap.forEach(function (item, key) {
        var datecount = [key, item];
        result.push(datecount);
    });

    return result;
}
getLike();
)js";

/// Returns a copy of @p _strData containing only the characters in a whitelist:
/// code points 12593..12622 and 44032..55203 (Hangul compatibility jamo and
/// Hangul syllables), digits/numbers, whitespace, lower- and upper-case
/// letters, and symbols. Every other character (punctuation, for example) is
/// dropped.
QString GetSafeUtf(const QString& _strData)
{
    QString str;
    const QChar *pch = _strData.data();
    for (int i = 0; i < _strData.length(); i++)
    {
        const QChar ch = pch[i];
        const ushort code = ch.unicode();
        const bool bHangul = (code >= 12593 && code <= 12622) ||
                             (code >= 44032 && code <= 55203);
        // The Hangul ranges do not satisfy any of the category tests below,
        // so a character is appended at most once.
        if (bHangul || ch.isDigit() || ch.isNumber() || ch.isSpace() ||
            ch.isLower() || ch.isUpper() || ch.isSymbol())
            str += ch;
    }
    return str;
}

/// Sets the referenced bool to true for the lifetime of this object and back
/// to false when it is destroyed.
class BoolController
{
public:
    explicit BoolController(bool& _b):m_b(_b) { m_b = true; }
    ~BoolController() { m_b = false; }
private:
    bool &m_b;
};

/// Deleter used by Lock: unlocks the mutex instead of deleting it.
void unlock(QMutex* _pm)
{
    _pm->unlock();
}

/// Scoped mutex guard. Locks the mutex on construction; the mutex is unlocked
/// when the last copy of the guard is destroyed (the shared_ptr's deleter is
/// unlock(), so the mutex itself is never deleted).
class Lock
{
public:
    explicit Lock(QMutex* _pm):m_pMutex(_pm, unlock) { _pm->lock(); }
private:
    std::shared_ptr<QMutex> m_pMutex;
};

/// Creates a crawler bound to @p _page. When @p _page is null a new SWebPage
/// is created with this object passed to its constructor.
SNaverBlog::SNaverBlog(std::shared_ptr<SWebPage> _page)
    : m_pWebPage(_page),
      m_eMode(E_CRAWL_MODE::BODY),
      m_bReplyProcessed(false),
      m_bBodyProcessed(false),
      //m_bReplyProcessing(false),
      m_bBodyProcessing(false),
      m_pNetworkRequest(new QNetworkRequest),
      m_nBodyRetry(3)
{
    // The previous code was make_shared(new SWebPage(this)), which built two
    // pages: a throw-away one and a second one constructed from its pointer.
    // NOTE(review): assumes the stripped template argument was <SWebPage>.
    if (!_page)
        m_pWebPage = std::make_shared<SWebPage>(this);

    for (int i = 0; i < E_FUNC_MAX; ++i)
        m_abOk[i] = false;

    initConnect();
}

/// Wires the page's loadFinished/signalAlert signals to this object and turns
/// off automatic image loading on the page.
void SNaverBlog::initConnect()
{
    QObject::connect(m_pWebPage.get(), &SWebPage::loadFinished, this, &SNaverBlog::slotLoadFinished);
    QObject::connect(m_pWebPage.get(), &SWebPage::signalAlert, this, &SNaverBlog::slotAlert);
    m_pWebPage->settings()->setAttribute(QWebEngineSettings::AutoLoadImages, false);
}

/// Rewrites "http://<id>.blog.me/<logNo>" as "http://blog.naver.com/<id>/<logNo>".
/// Any other URL is returned unchanged.
QString SNaverBlog::changeUrl(const QString& _url)
{
    // "http://<id>.blog.me/<logNo>" splits into ["http:", "", "<id>.blog.me", "<logNo>"].
    const QStringList strList = _url.split("/");
    if ((strList.size() > 3) && strList.at(2).contains("blog.me")) // <id>.blog.me
    {
        QString strOut = "http://blog.naver.com/";
        strOut += strList.at(2).split(".").at(0);
        strOut += "/";
        strOut += strList.at(3);
        return strOut;
    }
    return _url;
}

/// Forwards a message received through SWebPage::signalAlert as a
/// DELETED_URL_ERROR.
void SNaverBlog::slotAlert(const QString& msg)
{
    emit signalError(E_ERROR_CODE::DELETED_URL_ERROR, msg.toUtf8());
}

/// Starts loading @p _url in crawl mode @p _mode.
/// In BODY mode the trimmed, un-rewritten URL is remembered in m_strUrl; the
/// reply and like URLs are later derived from it.
void SNaverBlog::go(const QString &_url, E_CRAWL_MODE _mode)
{
    m_eMode = _mode;
    const QUrl url(changeUrl(_url));
    if (_mode == E_CRAWL_MODE::BODY)
        m_strUrl = _url.trimmed();
    m_pWebPage->load(url);
}

/// Runs the page scripts for the current crawl mode once a load has finished.
/// A failed load is reported as UNKNOWN_ERROR.
void SNaverBlog::slotLoadFinished(bool ok)
{
    qDebug() << "slotLoadFinished";
    if (!ok)
    {
        emit signalError(E_ERROR_CODE::UNKNOWN_ERROR, "UnkownError/BlockError/ProxyError");
        return;
    }

    switch(m_eMode)
    {
    case E_CRAWL_MODE::BODY:
    {
        if (m_bBodyProcessed)
            return;
        qDebug() << "body loadfinished";
        // Each callback records its field and marks itself done; whichever one
        // finishes last moves on to the reply page (see doneBodyCrawler()).
        m_pWebPage->runJavaScript(scriptArticleDate, std::bind(&SNaverBlog::cbBodyArticleDate, this, _1));
        m_pWebPage->runJavaScript(scriptArticleData, std::bind(&SNaverBlog::cbBodyArticleData, this, _1));
        m_pWebPage->runJavaScript(scriptArticleNickname, std::bind(&SNaverBlog::cbBodyArticleNickname, this, _1));
        m_pWebPage->runJavaScript(scriptArticleOrder, std::bind(&SNaverBlog::cbBodyArticleOrder, this, _1));
        m_pWebPage->runJavaScript(scriptArticleTitle, std::bind(&SNaverBlog::cbBodyArticleTitle, this, _1));
        // NOTE(review): the like-count script is delivered to cbBodyReplyUrl(),
        // so its result is stored under REPLY_URL, not LIKE_COUNT. Kept as is;
        // confirm whether cbBodyLikeCount() was meant here.
        m_pWebPage->runJavaScript(scriptLikeCount, std::bind(&SNaverBlog::cbBodyReplyUrl, this, _1));
        return;
    }
    case E_CRAWL_MODE::REPLY:
    {
        if (m_bReplyProcessed)
            return;
        qDebug() << "reply loadfinished";
        m_pWebPage->runJavaScript(scriptReply, std::bind(&SNaverBlog::cbReply, this, _1));
        break;
    }
    case E_CRAWL_MODE::LIKE:
    {
        m_pWebPage->runJavaScript(scriptLike, std::bind(&SNaverBlog::cbLike, this, _1));
        break;
    }
    }
}

/// Returns false when the body looks like it was not scraped at all: the
/// nickname is empty AND the date is empty or the "1990-01-01 00:00:00"
/// placeholder. Returns true otherwise.
bool SNaverBlog::check(const DataForm &_data)
{
    const bool bNoNickname = _data.data[ARTICLE_NICKNAME].isEmpty();
    const bool bNoDate = _data.data[ARTICLE_DATE] == "1990-01-01 00:00:00" ||
                         _data.data[ARTICLE_DATE].isEmpty();
    return !(bNoNickname && bNoDate);
}

/// Reply lists are always accepted.
bool SNaverBlog::check(const QVector<DataForm> &_data)
{
    Q_UNUSED(_data);
    return true;
}

/// Builds the comment-list URL for a post URL of the form
/// "http://blog.naver.com/<id>/<logNo>" or "http://<id>.blog.me/<logNo>".
/// For any other shape, emits OUT_DATED_CRAWLER and returns an empty string.
QString SNaverBlog::makeReplyUrl(const QString& _url)
{
    const QStringList strList = _url.split("/");
    QString strOut = "";
    QString strBlogId;
    QString strLogNo;

    if ((strList.size() > 4) && (strList.at(2).compare("blog.naver.com") == 0))
    {
        strBlogId = strList.at(3);
        strLogNo = strList.at(4);
    }
    else if ((strList.size() > 3) && strList.at(2).contains("blog.me")) // <id>.blog.me
    {
        strBlogId = strList.at(2).split(".").at(0);
        strLogNo = strList.at(3);
    }
    else
    {
        // Report the URL that could not be parsed (the old message appended
        // strOut, which is always empty on this path).
        emit signalError(E_ERROR_CODE::OUT_DATED_CRAWLER, "Check Body and ReplyUrl\n" + _url);
        return strOut;
    }

    // Both URL shapes share one tail. The blog.me path used to produce
    // "http://blog.naver.com//CommentList.nhn", and the query literal had
    // "&currentPage" corrupted into a currency-sign character.
    strOut = "http://blog.naver.com";
    strOut += "/CommentList.nhn?blogId=";
    strOut += strBlogId;
    strOut += "&logNo=";
    strOut += strLogNo;
    strOut += "&currentPage=&isMemolog=false&focusingCommentNo=&showLastPage=true&shortestContentAreaWidth=false";
    return strOut;
}

/// Builds the like-history URL, e.g.
/// http://blog.naver.com/SympathyHistoryList.nhn?blogId=yewonerang&logNo=220984900374
///
/// Accepts "http://<id>.blog.me/<logNo>" and "http://<host>/<id>/<logNo>".
/// For a URL with too few segments, emits OUT_DATED_CRAWLER and returns an
/// empty string. The old code checked size() < 4 but then read index 4 and
/// carried on after signalling the error.
QString SNaverBlog::makeLikeUrl(const QString& _url)
{
    const QStringList strList = _url.split("/");
    QString strOut = "";
    QString strBlogId;
    QString strLogNo;

    if ((strList.size() > 3) && strList.at(2).contains("blog.me")) // <id>.blog.me
    {
        strBlogId = strList.at(2).split(".").at(0);
        strLogNo = strList.at(3);
    }
    else if (strList.size() > 4)
    {
        strBlogId = strList.at(3);
        strLogNo = strList.at(4);
    }
    else
    {
        emit signalError(E_ERROR_CODE::OUT_DATED_CRAWLER, "Check Body and LikeUrl\n");
        return strOut;
    }

    strOut += "http://blog.naver.com/SympathyHistoryList.nhn?blogId=";
    strOut += strBlogId;
    strOut += "&logNo=";
    strOut += strLogNo;
    return strOut;
}

/// Stores the post date, rewritten by replacing '/' and '.' with '-',
/// "- " with ' ' and 'T' with ' ', then appending ":00". An empty script
/// result becomes the "1990-01-01 00:00:00" placeholder.
void SNaverBlog::cbBodyArticleDate(const QVariant& _result)
{
    QString strDate = _result.toString();
    strDate = strDate.trimmed().replace("/", "-").replace(".", "-").replace("- "," ").replace("T", " ");
    if (!strDate.isEmpty())
        strDate += ":00";
    else
        strDate = "1990-01-01 00:00:00";

    m_efData.body.data[ARTICLE_DATE] = strDate;
    if (doneBodyCrawler(E_FUNC_ARTICLE_DATE))
        goReplyUrl();
}

/// Stores the post body text after trimming and filtering through GetSafeUtf().
void SNaverBlog::cbBodyArticleData(const QVariant& _result)
{
    m_efData.body.data[ARTICLE_DATA] = GetSafeUtf(_result.toString().trimmed());
    if (doneBodyCrawler(E_FUNC_ARTICLE_DATA))
        goReplyUrl();
}

/// Stores the post title after trimming and filtering through GetSafeUtf().
void SNaverBlog::cbBodyArticleTitle(const QVariant& _result)
{
    m_efData.body.data[ARTICLE_TITLE] = GetSafeUtf(_result.toString().trimmed());
    if (doneBodyCrawler(E_FUNC_ARTICLE_TITLE))
        goReplyUrl();
}

/// Stores the blog nickname after trimming and filtering through GetSafeUtf().
void SNaverBlog::cbBodyArticleNickname(const QVariant& _result)
{
    m_efData.body.data[ARTICLE_NICKNAME] = GetSafeUtf(_result.toString().trimmed());
    if (doneBodyCrawler(E_FUNC_ARTICLE_NICKNAME))
        goReplyUrl();
}

/// Stores the digits of the comment-list link text under ARTICLE_ORDER.
void SNaverBlog::cbBodyArticleOrder(const QVariant& _result)
{
    QString strData = _result.toString().trimmed();
    // Drops every non-digit run, which also covers thousands separators.
    strData = strData.replace(QRegExp("[\\D]+"), "");

    m_efData.body.data[ARTICLE_ORDER] = strData;
    if (doneBodyCrawler(E_FUNC_ARTICLE_ORDER))
        goReplyUrl();
}

/// Stores the counter text (commas removed) under REPLY_URL.
/// While the result is "-1" or empty and retry budget remains, the script is
/// re-run after TIME milliseconds instead.
///
/// The retry budget used to be a function-local static, i.e. shared by every
/// SNaverBlog in the process and never replenished. It is now the
/// per-instance m_nBodyRetry, initialised to 3 in the constructor (the old
/// static allowed 2 retries).
void SNaverBlog::cbBodyReplyUrl(const QVariant& _result)
{
    const QString strData = _result.toString().trimmed().replace(",", "");
    if ((strData == "-1" || strData.isEmpty()) && m_nBodyRetry-- > 0)
    {
        QTimer::singleShot(TIME, this, [this](){
            this->m_pWebPage->runJavaScript(scriptReplyUrl, std::bind(&SNaverBlog::cbBodyReplyUrl, this, _1));
        });
        return;
    }

    m_efData.body.data[REPLY_URL] = strData;
    if (doneBodyCrawler(E_FUNC_REPLY_URL))
        goReplyUrl();
}

/// Stores the like count (commas removed) under LIKE_COUNT, with the same
/// retry behaviour and per-instance budget as cbBodyReplyUrl().
///
/// NOTE(review): this marks E_FUNC_REPLY_URL as done, not a like-specific
/// slot. The E_FUNC enum is not visible here, so it is left unchanged;
/// nothing in this file schedules this callback initially.
void SNaverBlog::cbBodyLikeCount(const QVariant& _result)
{
    const QString strData = _result.toString().trimmed().replace(",", "");
    if ((strData == "-1" || strData.isEmpty()) && m_nBodyRetry-- > 0)
    {
        QTimer::singleShot(TIME, this, [this](){
            this->m_pWebPage->runJavaScript(scriptLikeCount, std::bind(&SNaverBlog::cbBodyLikeCount, this, _1));
        });
        return;
    }

    m_efData.body.data[LIKE_COUNT] = strData;
    if (doneBodyCrawler(E_FUNC_REPLY_URL))
        goReplyUrl();
}

/// Marks @p _func_type as finished and returns true when every entry of
/// m_abOk is set. The update and the scan run under m_mutexBody.
bool SNaverBlog::doneBodyCrawler(E_FUNC _func_type)
{
    Lock lock(&m_mutexBody);
    m_abOk[_func_type] = true;

    bool ok = true;
    for (int i = 0; i < E_FUNC_MAX; ++i)
    {
        ok &= m_abOk[i];
    }
    return ok;
}

/// Called once all body fields are in: validates the body and loads the
/// comment-list page. Emits BLOCK_ERROR when the body fails check().
void SNaverBlog::goReplyUrl()
{
    m_bBodyProcessed = true;

    if (!check(m_efData.body))
    {
        emit signalError(E_ERROR_CODE::BLOCK_ERROR, "Block or Check naver");
        return;
    }

    const QString strReplyUrl = makeReplyUrl(m_strUrl);
    // makeReplyUrl() has already signalled the error; do not load an empty URL.
    if (strReplyUrl.isEmpty())
        return;
    go(strReplyUrl, E_CRAWL_MODE::REPLY);
}

/// Validates the body and loads the like-history page.
/// Emits BLOCK_ERROR when the body fails check().
void SNaverBlog::goLikeUrl()
{
    m_bBodyProcessed = true;

    if (!check(m_efData.body))
    {
        emit signalError(E_ERROR_CODE::BLOCK_ERROR, "Block or Check naver");
        return;
    }

    const QString strLikeUrl = makeLikeUrl(m_strUrl);
    // makeLikeUrl() has already signalled the error; do not load an empty URL.
    if (strLikeUrl.isEmpty())
        return;
    go(strLikeUrl, E_CRAWL_MODE::LIKE);
}

/// Converts the rows returned by scriptReply into DataForm entries and then
/// moves on to the like-history page.
/// Row layout: id, nickname, date, comment, order, parent ("-1" = no parent).
/// Rows that do not have exactly six fields are skipped.
void SNaverBlog::cbReply(const QVariant& _result)
{
    if (_result.isValid() && !_result.isNull())
    {
        const QList<QVariant> results = _result.toList();
        for (const QVariant &result : results)
        {
            const QStringList slResult = result.toStringList();
            if (slResult.size() != 6)
                continue;

            const QString strParent = slResult.at(5).trimmed();

            DataForm form;
            form.data[ARTICLE_ID] = slResult.at(0).trimmed();
            form.data[ARTICLE_NICKNAME] = GetSafeUtf(slResult.at(1).trimmed());
            form.data[ARTICLE_DATE] =
                slResult.at(2).trimmed().replace(".", "-").replace("/", "-").replace("T", " ").
                replace("- ", " ") + ":00";
            form.data[ARTICLE_DATA] = GetSafeUtf(slResult.at(3).trimmed());
            form.data[ARTICLE_ORDER] = slResult.at(4).trimmed();
            form.data[ARTICLE_PARENT] = (strParent == "-1") ? "" : strParent;
            m_efData.reply.append(form);
        }
    }
    goLikeUrl();
}

/// Stores the per-date counts returned by scriptLike, keyed by the date with
/// its dots removed, then emits signalDataOk with everything collected.
/// Rows with fewer than two fields are skipped.
void SNaverBlog::cbLike(const QVariant& _result)
{
    if (_result.isValid() && !_result.isNull())
    {
        const QList<QVariant> results = _result.toList();
        for (const QVariant &result : results)
        {
            const QStringList slResult = result.toStringList();
            if (slResult.size() < 2)
                continue;

            QString date = slResult.at(0);
            date.replace(".", "");
            m_efData.like[date] = slResult.at(1).toInt();
        }
    }
    emit signalDataOk(m_efData);
}