#include "scrawler.h"

// NOTE(review): the header names on the original #include lines were not
// legible in the reviewed copy of this file. The list below covers every
// standard/Qt name this file uses; reconcile it with the real include list
// before merging.
#include <iostream>

#include <QDateTime>
#include <QFile>
#include <QList>
#include <QMetaObject>
#include <QNetworkRequest>
#include <QSqlError>
#include <QSqlQuery>
#include <QStringList>
#include <QTextStream>
#include <QUrl>
#include <QWebElement>
#include <QWebElementCollection>
#include <QWebFrame>
#include <QWebPage>
#include <QWebSettings>

using namespace std;

#include "data.h"

namespace
{

/// Returns @p url with everything from the first '?' removed (and the
/// remainder trimmed). A URL without '?' is returned unchanged.
QString urlWithoutQuery(const QString &url)
{
    const QStringList parts = url.split("?");
    return (parts.size() > 1) ? parts.at(0).trimmed() : url;
}

/// Returns the fourth '/'-separated piece of @p url, or an empty string when
/// the URL has fewer pieces (QList::value() never indexes out of range,
/// unlike QList::at()).
QString platformIdFromUrl(const QString &url)
{
    return url.split("/").value(3);
}

/// Converts the text of a comment's date element into the string stored in
/// the article_date column.
///
/// - Empty input yields an empty string so the caller can skip the comment.
/// - Text containing '.' has every "." replaced by "-" and every "- " by " ".
/// - Any other text is treated as a time of day: today's date is put in front
///   of it and ":00" is appended. If the resulting moment lies in the future
///   it is moved back one day.
QString normalizeCommentDate(const QString &rawDate)
{
    if (rawDate.isEmpty())
        return QString();

    if (rawDate.count(".") != 0)
        return QString(rawDate).replace(".","-").replace("- "," ");

    const QDateTime now = QDateTime::currentDateTime();
    QString strDate = now.toString("yyyy-MM-dd");
    strDate += (" " + rawDate + ":00");

    const QDateTime stamp = QDateTime::fromString(strDate, "yyyy-MM-dd hh:mm:ss");
    // QDateTime::addDays() returns a new value; it does not modify the
    // object it is called on, so the result has to be used.
    if (stamp > now)
        strDate = stamp.addDays(-1).toString("yyyy-MM-dd hh:mm:ss");

    return strDate;
}

} // namespace

/// Appends @p _strData to the text file @p _strFilename.
/// Returns silently when the file cannot be opened.
void SCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile file(_strFilename);
    if (!file.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append))
        return;

    QTextStream out(&file);
    out << _strData;
    out.flush();
    file.close();
}

/// Creates the web page used for loading and routes its loadFinished(bool)
/// signal to saveResult(bool).
SCrawler::SCrawler(QObject *parent) : QObject(parent) , m_bUse(false)
{
    // Parent the page to this object so it is destroyed with the crawler;
    // the destructor below does not delete it.
    m_page = new QWebPage(this);
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}

SCrawler::~SCrawler()
{
}

/// Starts loading one article.
///
/// @param _strlistArgv  [0] crawl type ("naver_news" or "daum_cafe"),
///                      [1] article URL,
///                      [2] suffix appended to "data_" to form the table name,
///                      [3] value stored as KEYWORD_ID.
///
/// With fewer than four entries nothing is loaded: "fail" is printed and
/// finished() is emitted through a queued call.
void SCrawler::load(QStringList _strlistArgv)
{
    if (_strlistArgv.size() < 4)
    {
        cout << "fail";
        // Queued so the signal is delivered from the event loop rather than
        // from inside this call.
        QMetaObject::invokeMethod(this, "finished", Qt::QueuedConnection);
        return;
    }

    const QString strType = _strlistArgv.at(0);
    if (strType == "naver_news")
    {
        m_strUrl = _strlistArgv.at(1);
        m_data.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
        m_reply.SetSelect(E_NAVER_NEWS);
    }
    else if (strType == "daum_cafe")
    {
        m_strUrl = _strlistArgv.at(1);
        m_reply.SetSelect(E_DAUM_CAFE);
        // The article URL is recorded without its query string.
        m_data.setData(urlWithoutQuery(m_strUrl), SCrawlerData::ARTICLE_URL);
    }

    cout << m_strUrl.toStdString() << endl;

    QUrl url = QUrl(m_strUrl);
    if (url.scheme().isEmpty())
        url.setScheme("http");

    m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);

    // Stack object: QWebFrame::load() takes the request by const reference,
    // so nothing needs to live on the heap.
    QNetworkRequest request;
    request.setUrl(url);

    m_data.setTable("data_"+_strlistArgv.at(2));
    m_data.setData(_strlistArgv.at(3), SCrawlerData::KEYWORD_ID);

    request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
    m_page->mainFrame()->load(request);

    m_data.deleteDB(m_data.getData(SCrawlerData::ARTICLE_URL),SCrawlerData::ARTICLE_URL);
}

/// Returns the first @p _strElement descendant of @p _FindElement whose
/// attribute @p _strAttrib equals @p _strFind exactly, or a null element when
/// there is none.
///
/// NOTE(review): the default arguments sit on this out-of-line definition and
/// are kept as found; they must not be repeated on the declaration.
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib="",const QString _strFind="")
{
    QWebElementCollection elements = _FindElement.findAll(_strElement);
    foreach (QWebElement element, elements)
    {
        if (element.attribute(_strAttrib) == _strFind)
            return element;
    }
    return QWebElement();
}

/// Like Find(), but compares only the slice mid(@p _strStart, @p _strLength)
/// of the trimmed attribute value against @p _strFind, ignoring case.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    QWebElementCollection elements = _FindElement.findAll(_strElement);
    foreach (QWebElement element, elements)
    {
        const QString strSlice = element.attribute(_strAttrib).trimmed().mid(_strStart,_strLength);
        if (QString::compare(strSlice,_strFind,Qt::CaseInsensitive) == 0)
            return element;
    }
    return QWebElement();
}

/// Overload of FindMid() that uses the length of @p _strFind as slice length.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart, _strFind.length());
}

/// Returns the first element whose trimmed attribute value starts with
/// @p _strFind (case-insensitive).
QWebElement SCrawler::FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, 0);
}

/// Returns every element that FindMid() would accept, in the order
/// findAll() yields them.
QList<QWebElement> SCrawler::FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    QWebElementCollection elements = _FindElement.findAll(_strElement);
    QList<QWebElement> returnElements;
    foreach (QWebElement element, elements)
    {
        const QString strSlice = element.attribute(_strAttrib).trimmed().mid(_strStart,_strLength);
        if (QString::compare(strSlice,_strFind,Qt::CaseInsensitive) == 0)
            returnElements.append(element);
    }
    return returnElements;
}

/// Slot for QWebPage::loadFinished. Dispatches to the scraper selected in
/// load(), prints "ok" or "fail" depending on m_bUse, and emits finished().
/// Does nothing once m_bUse is already set.
void SCrawler::saveResult(bool ok)
{
    if (m_bUse)
        return;

    if (!ok)
    {
        cout << "Failed loading";
    }
    else
    {
        switch(m_reply.select())
        {
        case E_NAVER_NEWS:
            saveResultNaverNews();
            break;
        case E_DAUM_CAFE:
            saveResultDaumCafe();
            break;
        default:
            break;
        }
    }

    if (m_bUse)
        cout << "ok";
    else
        cout << "fail";

    emit finished();
}

/// Extracts title, date, body, like count and press information from the
/// loaded Naver news page, stores them through m_data, then starts m_reply.
void SCrawler::saveResultNaverNews()
{
    const QWebElement root = m_page->mainFrame()->documentElement();

    QString strTitle,strDate,strData,strPlatID,strPlatTitle,strlike;

    QWebElement element = Find(root,"div","class","article_info");
    strTitle = Find(element,"h3","id","articleTitle").toPlainText(); // Title
    strDate = Find(element,"span","class","t11").toPlainText();      // Date

    strData = Find(root,"div","id","articleBodyContents").toPlainText();
    strlike = Find(root,"div","class","u_likeit_module").toPlainText();

    // entertainment layout: title lives elsewhere
    if (strTitle.isEmpty())
    {
        QWebElement elementTitle = Find(root,"div","class","end_ct_area");
        strTitle = Find(elementTitle,"p","class","end_tit").toPlainText();
    }

    // entertainment layout: date and body fall back to other elements
    if (strDate.isEmpty())
        strDate = Find(element,"em").toPlainText();
    if (strData.isEmpty())
        strData = Find(root,"div","id","articeBody").toPlainText();

    if (!strlike.isEmpty())
        strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")";
    // To inspect pages without a like module:
    // Debug("out.html",m_page->mainFrame()->toHtml());

    element = Find(root,"div","class","press_logo");
    strPlatID = Find(element,"a").attribute("href");
    strPlatTitle = Find(element,"img").attribute("alt");

    // When the href starts with "http://www." keep only the piece that
    // follows it, up to the next '.'.
    const QStringList strlistPlat = strPlatID.split(".");
    if (strlistPlat.size() > 2 && strlistPlat.at(0) == QString("http://www"))
        strPlatID = strlistPlat.at(1);

    m_data.setData(m_data.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE);
    m_data.setData(m_data.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA);
    m_data.setData(strPlatID,SCrawlerData::PLATFORM_ID);
    m_data.setData(strPlatTitle,SCrawlerData::PLATFORM_TITLE);
    m_data.setData(strDate, SCrawlerData::ARTICLE_DATE);
    m_data.setData("naver", SCrawlerData::PLATFORM_NAME);
    m_data.setData("news", SCrawlerData::PLATFORM_FORM);
    m_data.setData("body", SCrawlerData::ARTICLE_FORM);
    m_data.sendDB();

    saveFrameNaverNews(m_page->mainFrame());

    m_reply.SetUrl(m_strUrl);
    m_reply.Start(&m_data);
}

/// Walks @p frame and its children until the frame named "ifrMemo" is found,
/// passes the number read from its "_totalcount" element to m_reply and sets
/// m_bUse.
void SCrawler::saveFrameNaverNews(QWebFrame *frame)
{
    if (m_bUse)
        return;

    if (frame->frameName() == "ifrMemo")
    {
        // Thousands separators are removed before the conversion.
        const QString strTotal = Find(frame->documentElement(),"strong","class","_totalcount").toPlainText().trimmed().replace(",","");
        m_reply.SetTotal(strTotal.toInt());
        m_bUse = true;
    }

    foreach(QWebFrame *childFrame, frame->childFrames())
        saveFrameNaverNews(childFrame);
}

/// Scrapes the loaded Daum cafe page, sends the article row, clears the ETC
/// field and starts m_reply.
void SCrawler::saveResultDaumCafe()
{
    saveFrameDaumCafe(m_page->mainFrame());
    m_data.sendDB();
    m_data.setData(QString(""),SCrawlerData::ETC);
    m_reply.Start(&m_data);
}

/// Walks @p frame and its children looking for the frame named "down".
///
/// For that frame the article fields are stored through m_data, one row per
/// visible comment/reply is inserted directly into the table, and m_bUse is
/// set. For every other frame the "src" of a <frame name="down"> element is
/// handed to m_reply.
void SCrawler::saveFrameDaumCafe(QWebFrame *frame)
{
    if (m_bUse)
        return;

    const QWebElement root = frame->documentElement();

    // Page title: the part of <title> before the first '|'.
    const QString strPageTitle = root.findFirst("title").toPlainText().trimmed().split("|").at(0).trimmed();

    // Computed unconditionally so that comment rows always carry the article
    // URL, even when this frame's document has no <title>.
    const QString strUrl_ = urlWithoutQuery(m_strUrl);

    if (!strPageTitle.isEmpty())
    {
        m_data.setData(strUrl_, SCrawlerData::ARTICLE_URL);
        m_data.setData(m_data.SqlString(m_data.GetSafeUtf(strPageTitle)), SCrawlerData::PLATFORM_TITLE);
    }

    if (frame->frameName() == "down")
    {
        m_reply.SetDaumData(SReplyGetManage::E_DAUM_CDEPTH,Find(root,"input","name","F_CDEPTH").attribute("value").trimmed());
        m_data.setData(Find(root,"input","name","grpid").attribute("value"),SCrawlerData::ETC);

        const QString strPlatformId = platformIdFromUrl(m_strUrl);
        QString strHits;

        // ---- Article body -------------------------------------------------
        {
            QString strData,strDate,strNick,strID,strArticleTitle;

            strData = m_data.SqlString(Find(root,"div","class","bbs_contents").toPlainText().trimmed());
            strData = m_data.GetSafeUtf(strData);

            strDate = Find(root,"span","class","p11 ls0").toPlainText().trimmed().replace(".","-");
            strDate = strDate.replace("- "," ");
            if (strDate.isEmpty())
                strDate = Find(root,"input","name","PLAIN_REGDT").attribute("value");
            else
                strDate += ":00";

            {
                const QWebElement subject = Find(root,"div","class","subject");
                strArticleTitle = m_data.SqlString(Find(subject,"span","class","b").toPlainText().trimmed());
            }

            const QWebElement writer = Find(root,"div","class","article_writer");
            strNick = Find(writer,"a","href","#").toPlainText().trimmed();

            // The second comma-separated piece of the onclick text, with
            // single quotes removed, is used as the writer id.
            const QStringList listOnClick = FindLeft(writer,"a","onclick","showSideView").attribute("onclick").trimmed().split(",");
            if (listOnClick.size() >= 2)
                strID = listOnClick.at(1).trimmed().replace("'","");

            // Hit count: the token that follows the "조회" label inside the
            // '|'-separated writer line.
            QStringList strList = writer.toPlainText().split("|");
            foreach(QString str,strList)
            {
                QStringList substrList = str.split(" ");
                for(int i = 0;i < substrList.size();i++)
                {
                    if((substrList.at(i).trimmed().compare("조회") == 0) || (substrList.at(i).trimmed().compare("\"조회\"") == 0))
                    {
                        // The label may be the last token; only read the next
                        // one when it exists.
                        if (i + 1 < substrList.size())
                            strHits = substrList.at(i+1).trimmed();
                        break;
                    }
                }
            }

            m_data.setData(strData, SCrawlerData::ARTICLE_DATA);
            m_data.setData(strDate, SCrawlerData::ARTICLE_DATE);
            m_data.setData(strNick, SCrawlerData::ARTICLE_NICKNAME);
            if(!strID.isEmpty())
                m_data.setData(strID, SCrawlerData::ARTICLE_ID);
            m_data.setData(strHits, SCrawlerData::ARTICLE_HIT);
            m_data.setData("daum", SCrawlerData::PLATFORM_NAME);
            m_data.setData("cafe", SCrawlerData::PLATFORM_FORM);
            m_data.setData("body", SCrawlerData::ARTICLE_FORM);
            m_data.setData(strPlatformId, SCrawlerData::PLATFORM_ID);
            m_data.setData(strArticleTitle, SCrawlerData::ARTICLE_TITLE);
        }

        // ---- Comments -----------------------------------------------------
        {
            const QWebElement group = Find(root,"div","class","commentDiv bg_sub");

            QString strNumber = Find(Find(root,"div","class","paging"),"a","onclick","return false;").toPlainText().trimmed();
            m_reply.SetDaumData(SReplyGetManage::E_DAUM_TOTAL,strNumber);
            if (strNumber.isEmpty())
                strNumber = "1";

            const QList<QWebElement> elements = FindAllMid(group,"div","id","_cmt-",0,5);

            const QString commHidden = "comment_hidden";
            const QString commPos = "comment_pos";
            const QString commReComm = "recomment_pos";

            QString strParent;  // nickname of the most recent top-level comment
            int nCount = (strNumber.toInt() - 1) * 50;

            foreach (QWebElement element, elements)
            {
                const QString strClass = element.attribute("class").trimmed();

                // Elements whose class ends with "comment_hidden" are skipped.
                if (strClass.right(commHidden.length()).compare(commHidden,Qt::CaseInsensitive) == 0)
                    continue;

                // The two prefixes cannot both match the same class string.
                const bool bComment = strClass.left(commPos.length()).compare(commPos,Qt::CaseInsensitive) == 0;
                const bool bReComment = strClass.left(commReComm.length()).compare(commReComm,Qt::CaseInsensitive) == 0;
                if (!bComment && !bReComment)
                    continue;

                QString strData = m_data.SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed());
                if (strData.isEmpty())
                    continue;
                // Applied to replies as well as top-level comments so both
                // kinds of row are passed through the same calls.
                strData = m_data.GetSafeUtf(strData);

                const QWebElement author = Find(element,"a","class","b");
                QString strID;
                const QStringList strListID = author.attribute("onclick").split(",");
                if(strListID.length() > 2)
                    strID = strListID.at(1).trimmed().replace("'","");
                const QString strNick = author.toPlainText().trimmed();

                QString strReParent;
                if (bComment)
                {
                    strParent = strNick;
                }
                else
                {
                    // A reply names its parent explicitly when it mentions
                    // someone; otherwise it belongs to the last top-level
                    // comment seen.
                    strReParent = Find(element,"span","class","mention_nicknames text_counter txt_point b").toPlainText().trimmed();
                    if(strReParent.length() == 0)
                        strReParent = strParent;
                }

                const QString strDate = normalizeCommentDate(Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed());
                if (strDate.isEmpty())
                    continue;

                QSqlQuery query;
                if (bComment)
                    query.prepare(QString("insert into " + m_data.GetTable() + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
                else
                    query.prepare(QString("insert into " + m_data.GetTable() + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());

                query.bindValue(":URL",strUrl_.toUtf8());
                query.bindValue(":ID",strID.toUtf8());
                query.bindValue(":NICK",strNick.toUtf8());
                query.bindValue(":DATA",strData.toUtf8());
                query.bindValue(":DATE",strDate.toUtf8());
                if (bReComment)
                    query.bindValue(":PARENT",strReParent.toUtf8());
                query.bindValue(":ROWNUM",nCount++);
                query.bindValue(":PLATFORMID",strPlatformId.toUtf8());
                query.bindValue(":HITS",strHits.toUtf8());
                // NOTE(review): the page title is bound here as extracted,
                // while the article row stores it after SqlString/GetSafeUtf.
                query.bindValue(":TITLE",strPageTitle.toUtf8());

                if (query.exec()==false)
                    cout << "error : " << query.lastError().text().toStdString();
            }
        }

        m_bUse = true;
    }
    else
    {
        m_reply.SetDaumData(SReplyGetManage::E_DAUM_DOWNSRC, Find(root,"frame","name","down").attribute("src").trimmed());
    }

    foreach(QWebFrame *childFrame, frame->childFrames())
        saveFrameDaumCafe(childFrame);
}