#include "scrawler.h"

// NOTE(review): the five #include directives that followed "scrawler.h" had
// lost their header names in the copy of the file this revision was made
// from. The list below is derived from the identifiers used in this file;
// confirm it against the original source.
#include <QFile>
#include <QNetworkProxy>
#include <QNetworkRequest>
#include <QSqlError>
#include <QSqlQuery>
#include <QStringList>
#include <QTextStream>
#include <QUrl>
#include <QVector>
#include <QWebElement>
#include <QWebFrame>
#include <QWebPage>
#include <QWebSettings>

#include <cstdlib>
#include <iostream>

using namespace std;

struct SProxyList
{
    QString m_strAddress;
    int m_nPort;
};

namespace
{

// Reads "proxy.txt" from the working directory (one "host" or "host,port"
// entry per line), picks one entry with rand() and installs it as the
// application-wide HTTP proxy. Does nothing when the file cannot be opened,
// contains no usable line, or the chosen line has more than two fields.
// rand() is not seeded here; whether it is seeded elsewhere is not visible
// from this file.
void applyRandomProxy()
{
    QFile file("proxy.txt");
    if (!file.open(QIODevice::ReadOnly | QIODevice::Text))
        return;

    QVector<QStringList> vecProxy;
    while (!file.atEnd()) {
        // readLine() keeps the trailing newline; trim it so that blank lines
        // are really skipped and the host name does not end in '\n'.
        const QString strLine = QString(file.readLine()).trimmed();
        if (strLine.isEmpty())
            continue;
        vecProxy.push_back(strLine.split(","));
    }
    if (vecProxy.isEmpty())
        return;

    const QStringList strList = vecProxy.at(rand() % vecProxy.size());
    switch (strList.size()) {
    case 1:
        cout << "p : " << strList.at(0).toStdString() << endl;
        // setApplicationProxy() takes a const reference, so a temporary is
        // enough; nothing has to be heap-allocated.
        QNetworkProxy::setApplicationProxy(
            QNetworkProxy(QNetworkProxy::HttpProxy, strList.at(0)));
        break;
    case 2:
        cout << "p : " << strList.at(0).toStdString() << endl;
        QNetworkProxy::setApplicationProxy(
            QNetworkProxy(QNetworkProxy::HttpProxy, strList.at(0), strList.at(1).toInt()));
        break;
    }
}

// Inserts one blog reply row into strTable. Prints "error : <text>" to stdout
// and returns false when the statement fails.
bool insertBlogReply(const QString &strTable,
                     const QString &strArticleUrl,
                     const QString &strNick,
                     const QString &strData,
                     const QString &strDate,
                     const QString &strParent,
                     const QString &strReplyUrl,
                     int nOrder)
{
    QSqlQuery query;
    query.prepare(QString("insert into " + strTable + " (article_url,article_nickname,article_data,article_date,article_parent,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
    query.bindValue(":URL", strArticleUrl.toUtf8());
    query.bindValue(":NICK", strNick.toUtf8());
    query.bindValue(":DATA", strData.toUtf8());
    query.bindValue(":DATE", strDate.toUtf8());
    query.bindValue(":PARENT", strParent.toUtf8());
    query.bindValue(":URLREPLY", strReplyUrl.toUtf8());
    query.bindValue(":ROWNUM", nOrder);
    if (query.exec() == false) {
        cout << "error : " << query.lastError().text().toStdString();
        return false;
    }
    return true;
}

// Inserts one cafe comment row into strTable. When pstrParent is null the
// article_parent column is left out of the statement (top-level comment);
// otherwise it is bound to *pstrParent (reply to another comment).
// Prints "error : <text>" to stdout and returns false when the statement fails.
bool insertCafeReply(const QString &strTable,
                     const QString &strArticleUrl,
                     const QString &strID,
                     const QString &strNick,
                     const QString &strData,
                     const QString &strDate,
                     const QString *pstrParent,
                     const QString &strReplyUrl,
                     int nOrder)
{
    QSqlQuery query;
    if (pstrParent)
        query.prepare(QString("insert into " + strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
    else
        query.prepare(QString("insert into " + strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8());
    query.bindValue(":URL", strArticleUrl.toUtf8());
    query.bindValue(":ID", strID.toUtf8());
    query.bindValue(":NICK", strNick.toUtf8());
    query.bindValue(":DATA", strData.toUtf8());
    query.bindValue(":DATE", strDate.toUtf8());
    if (pstrParent)
        query.bindValue(":PARENT", pstrParent->toUtf8());
    query.bindValue(":URLREPLY", strReplyUrl.toUtf8());
    query.bindValue(":ROWNUM", nOrder);
    if (query.exec() == false) {
        cout << "error : " << query.lastError().text().toStdString();
        return false;
    }
    return true;
}

} // namespace

// Creates the web page used for every load and routes its loadFinished(bool)
// signal to saveResult(bool).
SCrawler::SCrawler():QObject()
{
    // Parent the page to this object so it is destroyed together with the
    // crawler; the destructor does not delete it explicitly.
    m_page = new QWebPage(this);
    m_bUse = false;
    m_bLast = false;
    m_bError = false;
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}

SCrawler::~SCrawler()
{
}

// Configures the crawler from a command list and starts loading the page.
//
// _strlistArgv layout as read by this function:
//   [0] site          - only "naver" is recognised
//   [1] command       - "cafe_list", "cafe_data", "blog_list", "blog_url"
//                       or "blog_comm"
//   [2] URL to load
//   [3] table suffix  - the target table becomes "data_" + [3]
//   [4] keyword id (cafe_list / blog_list) or Referer URL (cafe_data)
//
// Missing entries are read as empty strings rather than indexing past the
// end of the list. The result is processed in saveResult() once the page
// reports loadFinished.
void SCrawler::load(QStringList _strlistArgv)
{
    m_bUse = false;

    const QString strSite = _strlistArgv.value(0);
    const QString strCommand = _strlistArgv.value(1);

    if (strSite == "naver") {
        if (strCommand == "cafe_list") {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_NAVER_CAFE_LIST;
            m_strKeywordID = _strlistArgv.value(4);
            applyRandomProxy();
        } else if (strCommand == "cafe_data") {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_NAVER_CAFE_DATA;
            m_strReper = _strlistArgv.value(4);
        } else if (strCommand == "blog_list") {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_NAVER_BLOG_LIST;
            m_strKeywordID = _strlistArgv.value(4);
            applyRandomProxy();
        } else if (strCommand == "blog_url") {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_NAVER_BLOG_BODY;
            // For blog bodies m_bUse starts out true and is cleared by
            // saveFrameUrl() when something goes wrong.
            m_bUse = true;
        } else if (strCommand == "blog_comm") {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_NAVER_BLOG_REPLY;
        }
        m_strTable = "data_" + _strlistArgv.value(3);
    }

    cout << m_strUrl.toStdString() << endl;

    QUrl url = QUrl(m_strUrl);
    if (url.scheme().isEmpty())
        url.setScheme("http");

    m_page->settings()->setAttribute(QWebSettings::AutoLoadImages, false);

    // QWebFrame::load() takes the request by const reference, so a stack
    // object is sufficient.
    QNetworkRequest request;
    request.setUrl(url);
    if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
        request.setRawHeader("Referer", m_strReper.toLocal8Bit());
    request.setRawHeader("Accept-Language", "ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");

    // Reset the per-load status before the load starts so that a result
    // delivered early cannot be overwritten afterwards.
    m_bLast = false;
    m_bError = false;
    m_page->mainFrame()->load(request);
}

// Marks the current load as failed. The message itself is not stored
// anywhere by this implementation.
void SCrawler::UpdateError(QString _strError)
{
    Q_UNUSED(_strError);
    m_bError = true;
}

// Slot for QWebPage::loadFinished. Hands the main frame to the extractor
// that matches m_nSelect, prints a one-word status ("block", "last", "ok" or
// "fail") to stdout and then emits finished().
void SCrawler::saveResult(bool ok)
{
    if (!ok) {
        std::cerr << "Failed loading " << qPrintable(m_page->mainFrame()->url().toString()) << std::endl;
        emit finished();
        return;
    }

    QWebFrame *pMainFrame = m_page->mainFrame();
    switch (m_nSelect) {
    case E_NAVER_CAFE_LIST:
        saveFrameCafeList(pMainFrame);
        break;
    case E_NAVER_CAFE_DATA:
        saveFrameCafeUrl(pMainFrame);
        break;
    case E_NAVER_BLOG_LIST:
        saveFrameList(pMainFrame);
        break;
    case E_NAVER_BLOG_BODY:
        saveFrameUrl(pMainFrame);
        break;
    case E_NAVER_BLOG_REPLY:
        saveFrameComment(pMainFrame);
        break;
    }

    switch (m_nSelect) {
    case E_NAVER_CAFE_LIST:
    case E_NAVER_BLOG_LIST:
        if (m_bError)
            cout << "block";
        if (m_bLast)
            cout << "last";
        break;
    case E_NAVER_BLOG_REPLY:
        cout << "ok";
        break;
    case E_NAVER_CAFE_DATA:
    case E_NAVER_BLOG_BODY:
        if (m_bUse == false) {
            cout << "fail";
            UpdateError("Error code 0");
        } else if (m_bError == false) {
            cout << "ok";
            UpdateError("ok");
        }
        break;
    }

    emit finished();
}

// Concatenates every numeric character of _str, in order, and converts the
// result with QString::toInt() (0 when there is no numeric character).
int SCrawler::GetNumber(QString _str)
{
    QString strDigits;
    for (int i = 0; i < _str.size(); ++i) {
        const QChar ch = _str.at(i);
        if (ch.isNumber())
            strDigits += ch;
    }
    return strDigits.toInt();
}

// Appends _strData to the text file _strFilename; silently returns when the
// file cannot be opened.
void SCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile file(_strFilename);
    const bool bOpened = file.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append);
    if (!bOpened)
        return;

    QTextStream out(&file);
    out << _strData;
    file.close();
}

// Escapes _str for use inside a quoted SQL string literal that uses
// backslash escapes: backslash, single quote and double quote each get a
// leading backslash.
QString SCrawler::SqlString(QString _str)
{
    // Backslashes must be handled first; otherwise a value ending in '\'
    // would escape the closing quote of the literal it is placed in.
    _str.replace("\\", "\\\\");
    _str.replace("'", "\\'");
    _str.replace("\"", "\\\"");
    return _str;
}

// Returns _strData with every character removed that is not one of:
//   - U+3131..U+314E (12593..12622)
//   - U+AC00..U+D7A3 (44032..55203)
//   - a digit/number, whitespace, lower-case letter, upper-case letter or
//     symbol according to QChar.
// Punctuation (including quotes and backslashes) is therefore dropped.
QString SCrawler::GetSafeUtf(QString _strData)
{
    QString strSafe;
    for (int i = 0; i < _strData.length(); ++i) {
        const QChar ch = _strData.at(i);
        const ushort code = ch.unicode();

        const bool bFirstRange = (code >= 12593 && code <= 12622);
        const bool bSecondRange = (code >= 44032 && code <= 55203);
        const bool bPlain = ch.isDigit() || ch.isNumber() || ch.isSpace()
                            || ch.isLower() || ch.isUpper() || ch.isSymbol();

        // The two ranges contain only caseless letters, so no character
        // satisfies more than one of the three tests.
        if (bFirstRange || bSecondRange || bPlain)
            strSafe += ch;
    }
    return strSafe;
}

// Walks the blog search result list (items "sp_blog_1" .. "sp_blog_10").
//
// NOTE(review): the remainder of this function and the beginning of
// saveFrameUrl() are missing from the copy of the file this revision was
// made from (the text breaks off inside the statement marked below). The
// surviving code is kept unchanged; restore the missing part from the
// original source before building.
void SCrawler::saveFrameList(QWebFrame *frame)
{
    if (m_bUse == true)
        return;
    QWebElement eleMain = Find(frame->documentElement(),"div","class","blog section _blogBase");
    QSqlQuery sql;
    for (int i = 0; i < 10 ; i++) {
        QString str = "sp_blog_";
        QWebElement sub = Find(eleMain,"li","id",str+QString::number(i+1));
        QString strUrl = Find(sub,"a","class","url").toPlainText();
        if (strUrl.isEmpty()) {
            // No further result item on this page: flag the listing as done.
            m_bLast = true;
            m_bUse = true;
            return;
        }
        QStringList strList = strUrl.split('/');
        if (strList.at(0).compare("blog.naver.com") != 0 ) {
            // NOTE(review): source text is missing from the middle of the
            // next statement up to the end of a block comment near the top
            // of saveFrameUrl(). Everything after that comment terminator is
            // the body of saveFrameUrl(), which uses a parameter named
            // 'frame' and calls itself recursively on child frames.
            cout << "x http://" << strUrl.toStdString() <toHtml()); */
    QSqlQuery sql;
    // Frame "BuddyConnectIframe": take the first line of the "profile_name"
    // block as the author's nickname and store it.
    // NOTE(review): the nickname and m_strUrl are concatenated into the
    // statement without escaping.
    if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0) {
        QWebElement profile = Find(frame->documentElement(),"div","class","profile_name");
        QString str = profile.toPlainText().split("\n").at(0);
        if (str.isEmpty() == false) {
            QString strQuery = "update " + m_strTable + " set article_nickname = '";
            strQuery += str;
            strQuery += "'";
            strQuery += " where article_url='";
            strQuery += m_strUrl;
            strQuery += "'";
            QString strUtf8(strQuery.toUtf8());
            if (sql.exec(strUtf8)==false) {
                cout << "error : " << sql.lastError().text().toStdString();
                UpdateError("Error code 1");
                m_bUse = false;
            }
        }
    }
    // Frame "mainFrame": collect nickname, id, title, date and body text of
    // the post and write them with a single update statement.
    if (frame->frameName().compare(QString("mainFrame")) == 0) {
        QString str[E_DATA_MAX];
        // Column names, indexed like str[]; only the first E_DATA_MAX - 1
        // entries are written by the loop further down.
        QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data"};
        QWebElement proTitle = Find(frame->documentElement(),"td","id","blogTitleText");
        QWebElement profile = Find(frame->documentElement(),"div","id","blog-profile");
        {
            QWebElement nick = Find(profile,"strong","id","nickNameArea");
            if (nick.toPlainText().isEmpty()==false)
                str[E_DATA_NICK] = nick.toPlainText();
            if(str[E_DATA_NICK].isEmpty()) {
                // Fall back to the "var nickName = '...'" assignment in the
                // page source.
                QString strHtml = frame->toHtml();
                QString strFind = "var nickName = '";
                int start = strHtml.indexOf(strFind);
                // NOTE(review): when start is -1 only a message is printed;
                // the code below still indexes strHtml using start.
                if (start == -1) {
                    cout << "error : nick name can not find and next again connect." << endl;
                }
                if (strHtml.at(start + strFind.length()) == QChar('\'')) {
                    cout << "error : nick name can not find and next again connect." << endl;
                } else {
                    int end = strHtml.indexOf("'",start + strFind.length());
                    str[E_DATA_NICK] = strHtml.mid(start + strFind.length(),end-start-strFind.length());
                }
            }
            // NOTE(review): split("/").at(3) is not bounds-checked; it
            // requires m_strUrl to contain at least three '/' characters.
            if (m_strUrl.split("/").at(3) == str[E_DATA_NICK]) {
                str[E_DATA_ID] = str[E_DATA_NICK];
            } else {
                QWebElement id = Find(profile,"span","class","itemfont col");
                if (id.toPlainText().isEmpty()==false) {
                    str[E_DATA_ID] = id.toPlainText();
                    str[E_DATA_ID] = str[E_DATA_ID].replace("(","").replace(")","");
                }
                if (str[E_DATA_ID].isEmpty())
                    str[E_DATA_ID] = m_strUrl.split("/").at(3);
            }
            // (A disabled lookup of the profile image element by its alt
            // text used to follow here.)
        }
        {
            QWebElement post = Find(frame->documentElement(),"div","id","postListBody");
            QWebElement post_top = Find(post,"table","class","post-top");
            {
                QWebElement title = Find(post_top,"div","class","htitle");
                if (title.toPlainText().isEmpty()==false) {
                    str[E_DATA_TITLE] = title.toPlainText();
                    str[E_DATA_TITLE] = GetSafeUtf(str[E_DATA_TITLE]);
                }
            }
            {
                QWebElement date = Find(post_top,"p","class","date fil5 pcol2 _postAddDate");
                str[E_DATA_DATE] = date.toPlainText().trimmed().replace("/","-");
                if ( str[E_DATA_DATE].isEmpty() == false) {
                    // Append a seconds field to the date text.
                    str[E_DATA_DATE] += ":00";
                } else {
                    UpdateError("Error code 4");
                    m_bUse = false;
                }
            }
            {
                QWebElement body = Find(post,"div","class","post-view pcol2 _param(1)");
                if (body.toPlainText().isEmpty()==false) {
                    str[E_DATA_DATA] = body.toPlainText();
                    str[E_DATA_DATA] = GetSafeUtf(str[E_DATA_DATA]);
                }
            }
        }
        // NOTE(review): nickname, id and date are placed in the statement
        // without escaping (title and body went through GetSafeUtf above).
        QString strQuery = "update " + m_strTable + " set ";
        for(int i = 0; i < E_DATA_MAX - 1 ; i++) {
            strQuery += strHead[i];
            strQuery += "='";
            strQuery += str[i].trimmed();
            strQuery += "'";
            if( i != (E_DATA_MAX - 2) )
                strQuery += ",";
        }
        strQuery += " where article_url='";
        strQuery += m_strUrl;
        strQuery += "'";
        QString strUtf8(strQuery.toUtf8());
        if (sql.exec(strUtf8)==false) {
            cout << "error : " << sql.lastError().text().toStdString();
            UpdateError("Error code 5");
            m_bUse = false;
        }
    }
    // Repeat for every nested frame.
    foreach(QWebFrame *childFrame, frame->childFrames())
        saveFrameUrl(childFrame);
}

// Stores the comments of a blog post. Every <li> below the element with id
// "commentList" is inspected:
//   class "_countableComment "        -> top-level comment; its author
//                                        becomes the parent of later replies
//   class "reply _countableComment "  -> reply to the last top-level comment
// Rows are numbered in insertion order, starting at 0.
void SCrawler::saveFrameComment(QWebFrame *frame)
{
    QWebElement group = Find(frame->documentElement(),"ul","id","commentList");
    QWebElementCollection elements = group.findAll("li");

    // The article URL is made of the first five '/'-separated pieces of the
    // comment page URL. Joining at most five pieces (instead of indexing 0..4
    // unconditionally) keeps short URLs from reading past the end of the list.
    const QStringList strPieces = m_strUrl.split("/");
    const QString strUrl = QStringList(strPieces.mid(0, 5)).join("/");

    QString strParent;
    int nCount = 0;
    foreach (QWebElement element, elements) {
        const QString strClass = element.attribute("class");

        if (strClass == "_countableComment ") {
            const QString strNick = Find(element,"a","class","nick pcol2").toPlainText();
            strParent = strNick;
            const QString strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
            // GetSafeUtf() already removes quotes and backslashes, and the
            // value is bound as a parameter, so no further escaping is done.
            const QString strComm = GetSafeUtf(Find(element,"dd","class","comm pcol2").toPlainText());
            if (strComm.isEmpty() == false) {
                insertBlogReply(m_strTable, strUrl, strNick, strComm.trimmed(), strDate,
                                QString(""), m_strUrl, nCount++);
            }
        }

        if (strClass == "reply _countableComment ") {
            QString strNick = Find(element,"a","class","nick pcol2").toPlainText();
            const QString strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
            QWebElement subElement = Find(element,"dd","class","comm pcol2");
            const QString subNick = Find(subElement,"a","class","nick pcol2").toPlainText();
            QString strComm = subElement.toPlainText();
            if (subNick.isEmpty() == false) {
                // The comment text contains a nickname link: the stored
                // nickname becomes the parent's, and the nickname plus one
                // following character are cut off the front of the text.
                strNick = strParent;
                strComm = strComm.right(strComm.size() - subNick.size() - 1);
            }
            if (strComm.isEmpty() == false) {
                insertBlogReply(m_strTable, strUrl, strNick, GetSafeUtf(strComm).trimmed(), strDate,
                                strParent, m_strUrl, nCount++);
            }
        }
    }
}

// Stores the cafe articles listed on a search result page and decides
// whether this was the last page.
//
// Every <li class="sh_cafe_top"> below the "cafe_article section _cafeBase"
// block yields one insert when its link points to cafe.naver.com. m_bLast is
// set when the page offset taken from the last query parameter of m_strUrl
// is within 10 of the total, or has reached 1000. m_bError is set when the
// total cannot be read.
void SCrawler::saveFrameCafeList(QWebFrame *frame)
{
    if (m_bUse == true)
        return;

    // Dumps the frame's HTML to "<frameName><n>.html" in the working
    // directory on every call.
    static int cz = 0;
    Debug(frame->frameName() + QString::number(cz++) + ".html", frame->toHtml());

    QWebElement eleMain = Find(frame->documentElement(),"div","class","cafe_article section _cafeBase");
    foreach (QWebElement eleSub, eleMain.findAll("li")) {
        if (eleSub.attribute("class") != "sh_cafe_top")
            continue;

        QString strUrl, strTitle;
        foreach (QWebElement eleSubUrl, eleSub.findAll("a")) {
            const QString strClass = eleSubUrl.attribute("class");
            if (strClass == "url")
                strUrl = eleSubUrl.attribute("href");
            if (strClass == "sh_cafe_title")
                strTitle = GetSafeUtf(eleSubUrl.toPlainText().trimmed());
        }

        // value() returns an empty string for a missing piece, so an item
        // without a usable link is skipped instead of indexing out of range.
        const QStringList strParts = strUrl.split("/");
        if (strParts.value(2) == "cafe.naver.com") {
            // The statement is assembled by concatenation rather than chained
            // QString::arg() calls: arg() would also substitute "%2".."%4"
            // sequences that happen to occur inside an already inserted URL.
            QString strQuery = "insert into ";
            strQuery += m_strTable;
            strQuery += " set platform_name='naver',platform_form='cafe',article_form='body'";
            strQuery += ",article_url='" + SqlString(strUrl) + "'";
            strQuery += ",platform_id='" + SqlString(strParts.value(3)) + "'";
            strQuery += ",article_title='" + strTitle + "'";
            strQuery += ",keyword_id='" + SqlString(m_strKeywordID) + "'";

            QSqlQuery sql;
            QString strUtf8(strQuery.toUtf8());
            if (sql.exec(strUtf8) == false)
                cout << "x " << sql.lastError().text().toStdString();
            else
                cout << "o " << strUrl.toStdString() << endl;
        }
        m_bUse = true;
    }

    // The "title_num" text is expected to look like "<range> / <total>".
    const QString strTotal = Find(eleMain,"span","class","title_num").toPlainText();
    const QStringList strTotalParts = strTotal.split("/");
    if (strTotal.isEmpty() || strTotalParts.size() < 2) {
        m_bError = true;
        return;
    }
    const int nTotal = GetNumber(strTotalParts.at(1));

    // split() always yields at least one piece, so last() is safe; a last
    // parameter without '=' counts as offset 0.
    const QStringList strParams = m_strUrl.split("&");
    const int nNow = GetNumber(strParams.last().split("=").value(1));

    if ((nNow + 10) > nTotal || nNow >= 1000)
        m_bLast = true;
}

// Stores a cafe article and its comments.
//
// In any frame, a non-empty <h1 class="d-none"> is stored as platform_title.
// In the frame named "cafe_main" the article text, date, author nickname/id
// and hit count are written with one update, the comments below "cmt_list"
// are inserted, and m_bUse is set so that no further frame is processed.
// Child frames are visited recursively.
void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
{
    if (m_bUse)
        return;

    QWebElement other = Find(frame->documentElement(),"h1","class","d-none");
    if (other.toPlainText().isEmpty() == false) {
        QString strQuery = "update ";
        strQuery += m_strTable;
        strQuery += " set ";
        strQuery += "platform_title = '" + SqlString(GetSafeUtf(other.toPlainText())) + "'";
        strQuery += " where article_url='";
        strQuery += SqlString(m_strUrl);
        strQuery += "'";

        QSqlQuery sql;
        QString strUtf8(strQuery.toUtf8());
        if (sql.exec(strUtf8) == false)
            cout << "error : " << sql.lastError().text().toStdString();
    }

    if (frame->frameName() == "cafe_main") {
        // ---- Article -------------------------------------------------
        {
            QString strData, strDate, strNick, strID, strHits;

            // GetSafeUtf() drops quotes and backslashes, so the body text
            // needs no additional escaping.
            strData = GetSafeUtf(Find(frame->documentElement(),"div","class","tbody m-tcol-c").toPlainText().trimmed());

            strDate = Find(frame->documentElement(),"td","class","m-tcol-c date").toPlainText().trimmed().replace(".","-");
            if (strDate.isEmpty()) {
                // Alternative layout: take the date from the <em> element
                // and append a zero time of day.
                strDate = Find(frame->documentElement(),"em","class","date m-tcol-c").toPlainText().trimmed().replace(".","-");
                strDate += " 00:00:00";
            } else {
                strDate += ":00";
            }

            strNick = Find(Find(frame->documentElement(),"div","class","etc-box"),"td","class","p-nick").toPlainText().trimmed();
            if (strNick.isEmpty() == false) {
                // Keep only the part before the first '('.
                QStringList list = strNick.split("(");
                if (list.isEmpty() == false)
                    strNick = list.at(0);
                QWebElement id = Find(frame->documentElement(),"td","class","m-tcol-c b nick");
                list = Find(id,"a","class","m-tcol-c b").attribute("onclick").trimmed().split(",");
                if (list.size() >= 2)
                    strID = list.at(1).trimmed().replace("'","");
            } else {
                // No nickname cell: take id and nickname from the 2nd and
                // 4th comma-separated piece of the link's onclick attribute.
                QStringList list = Find(Find(frame->documentElement(),"td","class","m-tcol-c b nick"),"a","class","m-tcol-c b").attribute("onclick").trimmed().split(",");
                if (list.size() >= 4) {
                    strID = list.at(1).trimmed().replace("'","");
                    strNick = list.at(3).trimmed().replace("'","");
                }
            }

            strHits = Find(frame->documentElement(),"span","class","b m-tcol-c reply _rosReadcount").toPlainText();
            if (strHits.isEmpty())
                strHits = Find(frame->documentElement(),"span","class","kin_count m-tcol-c _rosReadcount").toPlainText();

            // Values taken from the page are escaped before they are placed
            // inside the quoted literals of the statement.
            QString strQuery = "update ";
            strQuery += m_strTable;
            strQuery += " set ";
            strQuery += "article_data = '" + strData + "',";
            strQuery += "article_date = '" + SqlString(strDate) + "',";
            strQuery += "article_nickname = '" + SqlString(strNick) + "',";
            strQuery += "article_id = '" + SqlString(strID) + "',";
            strQuery += "article_hit = '" + SqlString(strHits) + "'";
            strQuery += " where article_url='";
            strQuery += SqlString(m_strUrl);
            strQuery += "'";

            QSqlQuery sql;
            QString strUtf8(strQuery.toUtf8());
            if (sql.exec(strUtf8) == false)
                cout << "error : " << sql.lastError().text().toStdString();
        }

        // ---- Comments ------------------------------------------------
        {
            QWebElement group = Find(frame->documentElement(),"ul","id","cmt_list");
            QWebElementCollection elements = group.findAll("li");
            QString strParent;
            int nCount = 0;
            foreach (QWebElement element, elements) {
                const QString strClass = element.attribute("class");

                if (strClass.isEmpty()) {
                    // Top-level comment.
                    QString strData = Find(element,"span","class","comm_body").toPlainText().trimmed();
                    if (strData.isEmpty())
                        continue;
                    strData = GetSafeUtf(strData);
                    const QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
                    const QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed();
                    // Remembered as the default parent of following replies.
                    strParent = strNick;
                    const QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed();
                    if (strDate.isEmpty())
                        continue;
                    insertCafeReply(m_strTable, m_strUrl, strID, strNick, strData, strDate,
                                    0, m_strReper, nCount++);
                }

                if (strClass == "reply") {
                    // Reply to a comment. Unlike top-level comments, the text
                    // is stored without passing through GetSafeUtf().
                    const QString strData = Find(element,"span","class","comm_body").toPlainText().trimmed();
                    if (strData.isEmpty())
                        continue;
                    QString strReParent = strParent;
                    const QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
                    const QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed();
                    const QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed();
                    // An explicit "re-p-nick" element overrides the default parent.
                    QWebElement eleParent = Find(element,"span","class","re-p-nick");
                    if (eleParent.toPlainText().isEmpty() == false)
                        strReParent = eleParent.toPlainText();
                    insertCafeReply(m_strTable, m_strUrl, strID, strNick, strData, strDate,
                                    &strReParent, m_strReper, nCount++);
                }
            }
        }
        m_bUse = true;
    }

    foreach(QWebFrame *childFrame, frame->childFrames())
        saveFrameCafeUrl(childFrame);
}

// Returns the first descendant of _FindElement with tag _strElement whose
// attribute _strAttrib is exactly _strFind, or a null QWebElement when no
// descendant matches.
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    foreach (QWebElement candidate, candidates) {
        if (candidate.attribute(_strAttrib) == _strFind)
            return candidate;
    }
    return QWebElement();
}