#include "scrawler.h"
// NOTE(review): the targets of the nine #include directives that follow
// "scrawler.h" are missing from the text under review (the <...> names were
// lost). The list below is rebuilt from the identifiers this file uses and
// must be confirmed against version control before merging.
#include <QDebug>
#include <QFile>
#include <QNetworkRequest>
#include <QRegExp>
#include <QSqlError>
#include <QSqlQuery>
#include <QTextStream>
#include <QTimer>
#include <QUrl>
#include <QWebElement>
#include <QWebFrame>
#include <QWebPage>
#include <QWebSettings>
#include <cstdlib>
#include <ctime>
#include <iostream>

/**
 * QWebPage whose JavaScript alert() handler ends the process.
 *
 * When the loaded page raises an alert, "deletedurl" is written to stdout and
 * the process exits with status 1.
 */
class SWebPage : public QWebPage
{
public:
    SWebPage(QObject *parent = 0) : QWebPage(parent) {}

protected:
    void javaScriptAlert(QWebFrame *frame, const QString &msg)
    {
        Q_UNUSED(frame);
        Q_UNUSED(msg);
        std::cout << "deletedurl";
        exit(1);
    }
    //bool javaScriptConfirm(QWebFrame * frame, const QString & msg){}
};

using namespace std;

const int RETRY_MAX = 4;
const int RETRY_INTERVAL = 3000;

struct SProxyList
{
    QString m_strAddress;
    int m_nPort;
};

/**
 * Creates the web page, resets the retry state and routes the page's
 * loadFinished(bool) signal to saveResult(bool).
 */
SCrawler::SCrawler() : QObject()
{
    // Parent the page to this object so it is destroyed together with the
    // crawler; the destructor below does not delete it.
    m_page = new SWebPage(this);
    m_nRetryCount = 0;
    m_bProcessed = false;
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
    srand(time(NULL));
}

SCrawler::~SCrawler()
{
}

/**
 * Configures the crawler from its argument list and starts loading the URL.
 *
 * Argument layout as read below:
 *   [0] site ("naver" or "daum")      [1] command (e.g. "blog_list")
 *   [2] URL                           [3] table suffix ("data_" is prepended)
 *   [4] keyword id, or the referer for the "cafe_data" commands
 *   [5] keyword id for the "cafe_data" commands
 *
 * Missing arguments are read as empty strings instead of indexing past the
 * end of the list.
 */
void SCrawler::load(QStringList _strlistArgv)
{
    m_bUse = false;
    m_bNothing = false;

    // QList::value() yields a default-constructed QString when the index is
    // out of range, so a short argument list can no longer be over-indexed.
    const QString strSite    = _strlistArgv.value(0);
    const QString strCommand = _strlistArgv.value(1);
    const QString strUrl     = _strlistArgv.value(2);
    const QString strArg4    = _strlistArgv.value(4);
    const QString strArg5    = _strlistArgv.value(5);

    if (strSite == "naver")
    {
        if (strCommand == "news_list")
        {
            m_strUrl = strUrl;
            m_nSelect = E_NAVER_NEWS_LIST;
            setProxy();
        }
        else if (strCommand == "cafe_list")
        {
            m_strUrl = strUrl;
            m_nSelect = E_NAVER_CAFE_LIST;
            m_strKeywordID = strArg4;
            setProxy();
        }
        else if (strCommand == "cafe_data")
        {
            m_strUrl = strUrl;
            m_nSelect = E_NAVER_CAFE_DATA;
            m_strReper = strArg4;
            m_strKeywordID = strArg5;
        }
        else if (strCommand == "blog_list")
        {
            m_strUrl = strUrl;
            m_nSelect = E_NAVER_BLOG_LIST;
            m_strKeywordID = strArg4;
            setProxy();
        }
        else if (strCommand == "blog_url")
        {
            m_strUrl = strUrl;
            m_nSelect = E_NAVER_BLOG_BODY;
            m_strKeywordID = strArg4;
            m_bUse = true;
        }
        else if (strCommand == "blog_comm")
        {
            m_strUrl = strUrl;
            m_nSelect = E_NAVER_BLOG_REPLY;
        }
        else if (strCommand == "news_data")
        {
            m_strUrl = strUrl;
            m_nSelect = E_NAVER_NEWS_DATA;
            m_strKeywordID = strArg4;
        }
        else if (strCommand == "news_comm")
        {
            m_strUrl = strUrl;
            m_nSelect = E_NAVER_NEWS_REPLY;
        }

        if (_strlistArgv.size() > 3)
            m_strTable = "data_" + _strlistArgv.at(3);
    }

    if (strSite == "daum")
    {
        if (strCommand == "cafe_list")
        {
            m_strUrl = strUrl;
            m_nSelect = E_DAUM_CAFE_LIST;
            m_strKeywordID = strArg4;
            setProxy();
        }
        else if (strCommand == "cafe_data")
        {
            m_strUrl = strUrl;
            m_nSelect = E_DAUM_CAFE_DATA;
            m_strReper = strArg4;
            m_strKeywordID = strArg5;
        }
        else if (strCommand == "blog_list")
        {
            m_strUrl = strUrl;
            m_nSelect = E_DAUM_BLOG_LIST;
            m_strKeywordID = strArg4;
            //cout << "ok";
            setProxy();
        }
        else if (strCommand == "blog_url")
        {
            m_strUrl = strUrl;
            m_nSelect = E_DAUM_BLOG_BODY;
            //m_strReper = _strlistArgv[4];
            m_bUse = true;
        }
        else if (strCommand == "blog_comm")
        {
            m_strUrl = strUrl;
            m_nSelect = E_DAUM_BLOG_REPLY;
        }

        // Same guard as the naver branch; this read used to be unconditional.
        if (_strlistArgv.size() > 3)
            m_strTable = "data_" + _strlistArgv.at(3);
    }

    cout << m_strUrl.toStdString() << endl;

    QUrl url = QUrl(m_strUrl);
    if (url.scheme().isEmpty())
        url.setScheme("http");

    m_page->settings()->setAttribute(QWebSettings::AutoLoadImages, false);

    // The request lives on the stack: it is only needed for the load() call,
    // and the former heap allocation was never freed.
    QNetworkRequest request;
    request.setUrl(url);
    /*
    request->setRawHeader("Cache-Control","max-age=0, no-cache");
    request->setRawHeader("Pragma","no-cache");
    request->setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
    */
    if (!m_strReper.isEmpty() && m_nSelect == E_NAVER_CAFE_DATA)
        request.setRawHeader("Referer", m_strReper.toLocal8Bit());
    request.setRawHeader("Accept-Language", "ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");

    m_page->mainFrame()->load(request);
    m_bLast = false;
    m_bError = false;
}

/**
 * Marks the current run as failed by setting m_bError.
 *
 * _strError is not used: the SQL update that consumed it is commented out.
 */
void SCrawler::UpdateError(QString _strError)
{
    /*
    QSqlQuery sql;
    _strError = "'" + _strError + "'";
    QString strQuery = "update " + m_strTableBody + " set ERROR = " + _strError;
    strQuery += "where URL='";
    strQuery += m_strUrl;
    strQuery += "'";
    QString strUtf8(strQuery.toUtf8());
    sql.exec(strUtf8);
    */
    m_bError = true;
}

/**
 * Slot for the page's loadFinished(bool) signal; also called directly by the
 * retry slots with ok == true.
 *
 * Dispatches to the scraper selected by m_nSelect, writes a status word to
 * stdout and emits finished(). Several scrapers can ask for a retry, in which
 * case this function returns early WITHOUT emitting finished().
 */
void SCrawler::saveResult(bool ok)
{
    // qDebug() << "saveResult";
    if (!ok)
    {
        cout << "Failed loading";
        deleteProxy();
        emit finished();
        return;
    }
    //qDebug() << "load complete";

    switch (m_nSelect)
    {
    case E_NAVER_NEWS_LIST:
        saveFrameNewsList(m_page->mainFrame());
        break;
    case E_NAVER_NEWS_DATA:
    {
        // Function-level static: after one news article has been handled,
        // every later call with this selector returns without emitting
        // finished().
        static bool loaded = false;
        if (loaded)
            return;
        loaded = true;
        if (!saveFrameNewsUrl(m_page->mainFrame()))
        {
            loaded = false; // let the next call enter this branch again
            return;
        }
        bodydata.sendDB();
        break;
    }
    case E_NAVER_NEWS_REPLY:
        if (!saveFrameNewsComment(m_page->mainFrame()))
            return;
        break;
    case E_NAVER_CAFE_LIST:
        saveFrameCafeList(m_page->mainFrame());
        break;
    case E_NAVER_CAFE_DATA:
        saveFrameCafeUrl(m_page->mainFrame());
        bodydata.sendDB();
        break;
    case E_NAVER_BLOG_LIST:
        if (!saveFrameList(m_page->mainFrame()))
            return;
        break;
    case E_NAVER_BLOG_BODY:
        if (!saveFrameUrl(m_page->mainFrame()))
            return;
        bodydata.sendDB();
        break;
    case E_NAVER_BLOG_REPLY:
        saveFrameComment(m_page->mainFrame());
        break;
    case E_DAUM_CAFE_LIST:
        saveFrameDaumCafeList(m_page->mainFrame());
        break;
    case E_DAUM_CAFE_DATA:
        saveFrameDaumCafeUrl(m_page->mainFrame());
        bodydata.sendDB();
        break;
    case E_DAUM_BLOG_LIST:
        saveFrameDaumBlogList(m_page->mainFrame());
        break;
    case E_DAUM_BLOG_BODY:
        saveFrameDaumBlogUrl(m_page->mainFrame());
        bodydata.sendDB();
        break;
    case E_DAUM_BLOG_REPLY:
        saveFrameDaumBlogComment(m_page->mainFrame());
        break;
    }

    // Status word on stdout, grouped by the kind of page that was scraped.
    switch (m_nSelect)
    {
    case E_NAVER_CAFE_LIST:
    case E_NAVER_BLOG_LIST:
    case E_DAUM_CAFE_LIST:
    case E_DAUM_BLOG_LIST:
    case E_NAVER_NEWS_LIST:
        if (m_bError)
        {
            cout << "block"; // block
            deleteProxy();
            break;
        }
        if (m_bNothing == false)
        {
            cout << "nothing";
            m_bNothing = true;
        }
        if (m_bLast)
        {
            cout << "last";
            m_bLast = false;
        }
        break;
    case E_NAVER_BLOG_REPLY:
    case E_NAVER_NEWS_REPLY:
    case E_DAUM_BLOG_REPLY:
        if (m_bUse)
        {
            cout << "ok";
            m_bUse = false;
        }
        break;
    case E_NAVER_CAFE_DATA:
    case E_NAVER_BLOG_BODY:
    case E_DAUM_CAFE_DATA:
    case E_DAUM_BLOG_BODY:
    case E_NAVER_NEWS_DATA:
        if (m_bUse == false)
        {
            cout << "fail";
            UpdateError("Error code 0");
        }
        else if (m_bError == false)
        {
            cout << "ok";
            UpdateError("ok");
        }
        break;
    }

    qDebug() << "finish";
    emit finished();
}

int
SCrawler::GetNumber(QString _str) { QString strNumber; for (int i = 0; i < _str.size();i++) { if (_str.at(i).isNumber()) strNumber += _str.at(i); } return strNumber.toInt(); } int SCrawler::GetNumber(QString _str, bool &ok) { QString strNumber; for (int i = 0; i < _str.size();i++) { if (_str.at(i).isNumber()) strNumber += _str.at(i); } return strNumber.toInt(&ok); } void SCrawler::Debug(QString _strFilename,QString _strData) { QFile file(_strFilename); if (!file.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append)) return; QTextStream out(&file); out << _strData; file.close(); } QString SCrawler::SqlString(QString _str) { _str = _str.replace("'","\\'"); _str = _str.replace("\"","\\\""); return _str; } QString SCrawler::GetSafeUtf(QString _strData) { QString str; QChar *pch = _strData.data(); for (int i = 0; i < _strData.length(); i++) { if (pch[i].unicode() >= 12593 && pch[i].unicode() <= 12622) str += pch[i]; if (pch[i].unicode() >= 44032 && pch[i].unicode() <= 55203) str += pch[i]; if (pch[i].isDigit() || pch[i].isNumber() || pch[i].isSpace() || pch[i].isLower() || pch[i].isUpper() || pch[i].isSymbol() ) str += pch[i]; } return str; } void SCrawler::reloadListPage() { ++m_nRetryCount; if (m_nRetryCount >= RETRY_MAX) { cout << "block"; emit finished(); return; } m_bProcessed = false; saveResult(true); } bool SCrawler::saveFrameList(QWebFrame *frame) { if (m_bProcessed == false) m_bProcessed = true; else return false; //qDebug() << frame->documentElement().toPlainText(); if (m_bUse == true) return true; QWebElement notFound = Find(frame->documentElement(),"div","id","notfound"); if(notFound.isNull() == false) { m_bLast = true; return true; } QWebElement naverBlock = Find(frame->documentElement(), "p", "class", "info01"); if (!naverBlock.isNull()) { m_bError = true; cout << "naver"; return true; } QWebElement eleMain = Find(frame->documentElement(),"div","class","blog section _blogBase"); QStringList urlList; if (eleMain.isNull()) { 
QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadListPage())); return false; } for (int i = 0; i < 10 ; i++) { QString str = "sp_blog_"; QWebElement sub = Find(eleMain,"li","id",str+QString::number(i+1)); QString strUrl = Find(sub,"a","class","url").attribute("href"); if (strUrl.isEmpty()) { //m_bLast = true; //m_bUse = true; break; } strUrl = strUrl.replace("http://",""); strUrl = strUrl.replace("?Redirect=Log&logNo=","/", Qt::CaseInsensitive); QStringList strList = strUrl.split('/'); QString strBlogMe = "blog.me"; if ((strList.at(0).compare("blog.naver.com") != 0 ) && (strList.at(0).right(strBlogMe.length()).compare(strBlogMe) != 0)) { continue; } if((strList.at(0).right(strBlogMe.length()).compare(strBlogMe) == 0)) { QStringList strSubList = strList.at(0).split('.'); strUrl = "blog.naver.com/"; strUrl += strSubList.at(0); strUrl += "/"; strUrl += strList.at(1); } urlList << QString("http://%1").arg(strUrl); } if(urlList.size() > 0) { QString strUrlList; strUrlList = "("; foreach(QString str, urlList) { strUrlList += "'"; strUrlList += str; strUrlList += "',"; } strUrlList = strUrlList.left(strUrlList.size() - 1); strUrlList += ")"; QSqlQuery sql; QString strQuery = "delete from "; strQuery += m_strTable; strQuery += QString(" where article_url in %1").arg(strUrlList); //qDebug() << strQuery; if (sql.exec(strQuery.toUtf8()) == false) { cout << "error " << sql.lastError().text().toStdString(); cout << strQuery.toStdString(); } } for (int i = 0; i < 10 ; i++) { QString str = "sp_blog_"; QWebElement sub = Find(eleMain,"li","id",str+QString::number(i+1)); QString strUrl = Find(sub,"a","class","url").attribute("href"); if (strUrl.isEmpty()) { //m_bLast = true; m_bUse = true; break; } strUrl = strUrl.replace("http://",""); strUrl = strUrl.replace("?Redirect=Log&logNo=","/", Qt::CaseInsensitive); QStringList strList = strUrl.split('/'); QString strBlogMe = "blog.me"; if ((strList.at(0).compare("blog.naver.com") != 0 ) && 
(strList.at(0).right(strBlogMe.length()).compare(strBlogMe) != 0)) { cout << "x http://" << strUrl.toStdString() <toHtml()); //QSqlQuery sql; if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0) { QWebElement profile = Find(frame->documentElement(),"div","class","profile_name"); QString str = profile.toPlainText().split("\n").at(0); if (str.isEmpty() == false) bodydata.setData(m_strUrl, bodydata.ARTICLE_URL); } if (frame->frameName().compare(QString("mainFrame")) == 0) { QString str[E_DATA_MAX]; QString sympathy; QString numofReply; QString strProfile; QWebElement proTitle = Find(frame->documentElement(),"meta","property","og:article:author"); str[E_DATA_PLATFORM_TITLE] = proTitle.attribute("content").split("|").at(1).trimmed(); if(str[E_DATA_PLATFORM_TITLE].length() > 0) str[E_DATA_PLATFORM_TITLE] = GetSafeUtf(str[E_DATA_PLATFORM_TITLE]); else { proTitle = Find(frame->documentElement(),"span","id","blogTitleName"); str[E_DATA_PLATFORM_TITLE] = GetSafeUtf(proTitle.toPlainText().trimmed()); } QWebElement image; QWebElement profile = Find(frame->documentElement(),"div","id","blog-profile"); { QWebElement nick = Find(profile,"strong","id","nickNameArea"); if (nick.toPlainText().isEmpty()==false) str[E_DATA_NICK] = nick.toPlainText(); if(str[E_DATA_NICK].isEmpty()) { QString strHtml = frame->toHtml(); QString strFind = "var nickName = '"; int start = strHtml.indexOf(strFind); if (start == -1) { cout << "error : nick name can not find and next again connect." << endl; } if (strHtml.at(start + strFind.length()) == QChar('\'')) { cout << "error : nick name can not find and next again connect." 
<< endl; } else { int end = strHtml.indexOf("'",start + strFind.length()); str[E_DATA_NICK] = strHtml.mid(start + strFind.length(),end-start-strFind.length()); } } str[E_DATA_NICK] = GetSafeUtf(str[E_DATA_NICK]); if (m_strUrl.split("/").at(3).trimmed() == str[E_DATA_NICK].trimmed()) { str[E_DATA_ID] = str[E_DATA_NICK]; } else { if (str[E_DATA_ID].isEmpty()) { if((m_strUrl.split("/").at(2).compare("blog.naver.com") == 0)) str[E_DATA_ID] = m_strUrl.split("/").at(3); else str[E_DATA_ID] = m_strUrl.split("/").at(2).split(".").at(0); } } if(str[E_DATA_NICK].length() == 0) str[E_DATA_NICK] = str[E_DATA_ID]; image = Find(profile,"img","alt","프로필 이미지"); strProfile = Find(profile, "p", "class", "caption align").toPlainText().trimmed(); } { QWebElement post = Find(frame->documentElement(),"div","id","postListBody"); { QWebElement weCmt = post.findFirst("a[class^='pcol2 _cmtList']"); if (!weCmt.isNull()) { numofReply = weCmt.toPlainText().replace(",", "").trimmed(); numofReply = numofReply.replace(QRegExp("[\\D]"), ""); } } QWebElement post_top = Find(post,"table","class","post-top"); { QWebElement title = Find(post_top,"span","class","pcol1 itemSubjectBoldfont"); if(title.isNull()) { title = Find(frame->documentElement(), "div", "class", "se_textView"); } if(title.isNull()) { title = Find(frame->documentElement(), "h3", "class", "se_textarea"); } if (title.toPlainText().isEmpty()==false) { str[E_DATA_TITLE] = title.toPlainText(); str[E_DATA_TITLE] = GetSafeUtf(str[E_DATA_TITLE]); } } { QWebElement date = Find(post_top,"p","class","date fil5 pcol2 _postAddDate"); if(date.isNull()) { date = Find(frame->documentElement(), "span","class","se_publishDate pcol2 fil5"); } str[E_DATA_DATE] = date.toPlainText().trimmed().replace("/","-"); if ( str[E_DATA_DATE].isEmpty() == false) { str[E_DATA_DATE] += ":00"; } else { UpdateError("Error code 4"); m_bUse = false; } } { //QWebElement body = Find(post,"div","class","post-view pcol2 _param(1)"); QWebElement body = 
post.findFirst("div[class^='post-view pcol2 _param(1)']"); if(body.isNull()) body = post.findFirst("div[class*='pcol2 _param(1)']"); if(body.isNull()) body = Find(post, "class", "se_component_wrap sect_dsc __se_component_area"); if (body.toPlainText().isEmpty()==false) { str[E_DATA_DATA] = body.toPlainText(); str[E_DATA_DATA] = GetSafeUtf(str[E_DATA_DATA]); } } { QWebElement WEsympathy = Find(frame->documentElement(),"div","class","btn_like pcol2"); if(WEsympathy.isNull()) { sympathy = "0"; } else { sympathy = WEsympathy.toPlainText().trimmed(); } //qDebug() << "Sympathy: " << sympathy; //qDebug() << strProfile; } { //retry if profile is empty and sympathy is empty if((strProfile.isEmpty() || sympathy.isEmpty()) && (m_nRetryCount < RETRY_MAX)) { m_nRetryCount++; qDebug() << m_nRetryCount; QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage())); return false; } } } //QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"}; bodydata.setData(str[0].trimmed(), bodydata.ARTICLE_NICKNAME); bodydata.setData(str[1].trimmed(), bodydata.ARTICLE_ID); bodydata.setData(str[2].trimmed(), bodydata.ARTICLE_TITLE); bodydata.setData(str[3].trimmed(), bodydata.ARTICLE_DATE); bodydata.setData(str[4].trimmed(), bodydata.ARTICLE_DATA); bodydata.setData(str[5].trimmed(), bodydata.PLATFORM_TITLE); if(image.attribute("src").trimmed().length() != 0) { bodydata.setData(image.attribute("src").trimmed(), bodydata.ARTICLE_PROFILEURL); } strProfile = GetSafeUtf(strProfile); if(strProfile.length() > 0) { bodydata.setData(strProfile, bodydata.ARTICLE_PROFILE); } bodydata.setData(m_strUrl, bodydata.ARTICLE_URL); bodydata.setTable(m_strTable); //bodydata.setData(sympathy, bodydata.ARTICLE_HIT); //original data bodydata.setData(numofReply, bodydata.ARTICLE_ORDER); bodydata.setData(sympathy, bodydata.REPLY_URL); bodydata.setData("naver", bodydata.PLATFORM_NAME); bodydata.setData("blog", bodydata.PLATFORM_FORM); 
bodydata.setData("body", bodydata.ARTICLE_FORM); bodydata.setData(m_strUrl.split("/").at(3), bodydata.PLATFORM_ID); bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID); } bool b_ok = true; foreach(QWebFrame *childFrame, frame->childFrames()) b_ok = (b_ok && saveFrameUrl(childFrame)); return b_ok; } void SCrawler::reloadPage() { //qDebug() << "reloadPage called"; saveResult(true); } void SCrawler::saveFrameComment(QWebFrame *frame) { QWebElement group = Find(frame->documentElement(),"ul","id","commentList"); QWebElementCollection elements = group.findAll("li"); QString strParent,strDate,strNick,strComm,strUrl,strId; QStringList strList = m_strUrl.split("/"); QString strCommUrl; for (int i=0; i < strList.size() - 1; i++) strUrl += strList.at(i) + "/"; { int nStartIdPos = m_strUrl.indexOf("blogId=") + QString("blogId=").size(); int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos); strUrl += m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos) + '/'; } { int nStartIdPos = m_strUrl.indexOf("logNo=") + QString("logNo=").size(); int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos); strUrl += m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos); } int nCount=0; foreach (QWebElement element, elements) { if (element.attribute("class") == "_countableComment ") { strNick = strParent = GetSafeUtf(Find(element,"a","class","nick pcol2").toPlainText()); strDate = Find(element,"span","class","date fil5 pcol2").toPlainText(); strComm = Find(element,"dd","class","comm pcol2").toPlainText(); strCommUrl = Find(element,"a","class","nick pcol2").attribute("href"); if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0) strId = strCommUrl.split("/").at(3).trimmed(); if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0) strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed(); if(strCommUrl.left(1) == "/") { QStringList strList = strCommUrl.split("&"); foreach(QString str, strList) { if(str.left(3) == "id=") { strId = 
str.right(str.length() - 3); } } } strComm = GetSafeUtf(strComm); if (strComm.isEmpty()== false) { strComm.replace("'","\\'"); strComm.replace("\"","\\\""); strComm = strComm.trimmed(); QSqlQuery query; if(strId.length() > 0) { query.prepare(QString("insert into " + m_strTable + " (article_id,article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:ID,:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8()); query.bindValue(":ID", strId.toUtf8()); } else query.prepare(QString("insert into " + m_strTable + " (article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8()); int nStartIdPos = m_strUrl.indexOf("blogId=") + QString("blogId=").size(); int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos); query.bindValue(":URL", strUrl.toUtf8()); query.bindValue(":PLATFORMID",m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos)); query.bindValue(":NICK",strNick.toUtf8()); query.bindValue(":DATA",strComm.toUtf8()); query.bindValue(":DATE",strDate.toUtf8()); query.bindValue(":URLREPLY",m_strUrl.toUtf8()); query.bindValue(":ROWNUM",(nCount++)); if (query.exec()==false) cout << "error : " << query.lastError().text().toStdString(); } } if (element.attribute("class") == "reply _countableComment ") { strNick = Find(element,"a","class","nick pcol2").toPlainText(); strDate = Find(element,"span","class","date fil5 pcol2").toPlainText(); QWebElement subElement = Find(element,"dd","class","comm pcol2"); QString subNick = Find(subElement,"a","class","nick pcol2").toPlainText(); strComm = subElement.toPlainText(); strCommUrl = Find(element,"a","class","nick pcol2").attribute("href"); if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0) strId = 
strCommUrl.split("/").at(3).trimmed(); if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0) strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed(); if(strCommUrl.left(1) == "/") { QStringList strList = strCommUrl.split("&"); foreach(QString str, strList) { if(str.left(3) == "id=") { strId = str.right(str.length() - 3); } } } if(subNick.isEmpty() == false) strComm = strComm.right(strComm.size()-subNick.size()-1); if (strComm.isEmpty() == false) { strComm = GetSafeUtf(strComm); strComm.replace("'","\\'"); strComm.replace("\"","\\\""); strComm = strComm.trimmed(); QSqlQuery query; if(strId.length() > 0) { query.prepare(QString("insert into " + m_strTable + " (article_id,article_url,platform_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:ID,:URL,:PLATFORMID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8()); query.bindValue(":ID", strId.toUtf8()); } else query.prepare(QString("insert into " + m_strTable + " (article_url,platform_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:PLATFORMID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8()); int nStartIdPos = m_strUrl.indexOf("blogId=") + QString("blogId=").size(); int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos); query.bindValue(":URL",strUrl.toUtf8()); query.bindValue(":PLATFORMID",m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos)); query.bindValue(":NICK",strNick.toUtf8()); query.bindValue(":DATA",strComm.toUtf8()); query.bindValue(":DATE",strDate.toUtf8()); query.bindValue(":PARENT",strParent.toUtf8()); query.bindValue(":URLREPLY",m_strUrl.toUtf8()); query.bindValue(":ROWNUM",(nCount++)); if (query.exec()==false) { cout << "error : " << query.lastError().text().toStdString(); } } } } } void SCrawler::saveFrameCafeList(QWebFrame *frame) { if (m_bUse == 
true) return; static int cz = 0; //Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml()); QWebElement notFound = Find(frame->documentElement(),"div","id","notfound"); if(notFound.isNull() == false) { m_bLast = true; return; } QWebElement naverBlock = Find(frame->documentElement(), "p", "class", "info01"); if (!naverBlock.isNull()) { m_bError = true; cout << "naver"; return; } QStringList urlList; QWebElement eleMain = Find(frame->documentElement(),"div","class","cafe_article section _cafeBase"); foreach(QWebElement eleSub,eleMain.findAll("li")) { if (eleSub.attribute("class") == "sh_cafe_top") { QString strUrl; foreach(QWebElement eleSubUrl,eleSub.findAll("a")) { if (eleSubUrl.attribute("class") == "url") strUrl = eleSubUrl.attribute("href"); } if (strUrl.split("/").at(2) == "cafe.naver.com") { urlList << strUrl; } } } if(urlList.size() > 0) { QString strUrlList; strUrlList = "("; foreach(QString str, urlList) { strUrlList += "'"; strUrlList += str; strUrlList += "',"; } strUrlList = strUrlList.left(strUrlList.size() - 1); strUrlList += ")"; QSqlQuery sql; QString strQuery = "delete from "; strQuery += m_strTable; strQuery += QString(" where article_url in %1").arg(strUrlList); //qDebug() << strQuery; if (sql.exec(strQuery.toUtf8()) == false) { cout << "error " << sql.lastError().text().toStdString(); cout << strQuery.toStdString(); } } foreach(QWebElement eleSub,eleMain.findAll("li")) { if (eleSub.attribute("class") == "sh_cafe_top") { QString strUrl,strTitle; foreach(QWebElement eleSubUrl,eleSub.findAll("a")) { if (eleSubUrl.attribute("class") == "url") strUrl = eleSubUrl.attribute("href"); if (eleSubUrl.attribute("class") == "sh_cafe_title") strTitle = GetSafeUtf(SqlString(eleSubUrl.toPlainText().trimmed())); } if (strUrl.split("/").at(2) == "cafe.naver.com") cout << "o " << strUrl.toStdString() << endl; m_bUse = true; } } { QWebElement total = Find(eleMain,"span","class","title_num"); if (total.toPlainText().isEmpty()) {m_bError = true; 
return;} total.toPlainText().split("/").size(); int nTotal = GetNumber(total.toPlainText().split("/").at(1)); QStringList strList = m_strUrl.split("&"); int nNow = GetNumber(strList.at(strList.size() - 1).split("=").at(1)); if ((nNow + 10) > nTotal || nNow >= 1000) m_bLast = true; } } void SCrawler::saveFrameCafeUrl(QWebFrame *frame) { if (m_bUse) return; QWebElement other = Find(frame->documentElement(),"h1","class","d-none"); if (other.toPlainText().isEmpty() == false) { bodydata.setData(SqlString(GetSafeUtf(other.toPlainText())), bodydata.PLATFORM_TITLE); } if (frame->frameName() == "cafe_main") { { QString strData,strDate,strNick,strID,strHits,strTitle,strReply,strLike; { QWebElement group = Find(frame->documentElement(),"div","class","tbody m-tcol-c"); strData = SqlString(group.toPlainText().trimmed()); strData = GetSafeUtf(strData); } { QWebElement group = Find(frame->documentElement(),"td","class","m-tcol-c date"); strDate = group.toPlainText().trimmed().replace(".","-"); if (strDate.isEmpty() == true) { QWebElement subgroup = Find(frame->documentElement(),"em","class","date m-tcol-c"); strDate = subgroup.toPlainText().trimmed().replace(".","-"); strDate += " 00:00:00"; } else strDate += ":00"; } { QWebElement group = Find(frame->documentElement(),"span","class","b m-tcol-c"); strTitle = SqlString(group.toPlainText().trimmed()); } { QWebElement group = Find(Find(frame->documentElement(),"div","class","etc-box"),"td","class","p-nick"); strNick = group.toPlainText().trimmed(); if (strNick.isEmpty() == false) { QStringList list = strNick.split("("); if (list.isEmpty() == false) strNick = list.at(0); QWebElement id = Find(frame->documentElement(),"td","class","m-tcol-c b nick"); list = Find(id,"a","class","m-tcol-c b").attribute("onclick").trimmed().split(","); if (list.size() >= 2) strID = list.at(1).trimmed().replace("'",""); } else { QStringList list = Find(Find(frame->documentElement(),"td","class","m-tcol-c b nick"),"a","class","m-tcol-c 
b").attribute("onclick").trimmed().split(","); if (list.size() >= 4) { strID = list.at(1).trimmed().replace("'",""); strNick = list.at(3).trimmed().replace("'",""); } } } strHits = Find(frame->documentElement(),"span","class","b m-tcol-c reply _rosReadcount").toPlainText(); { strReply = frame->documentElement().findFirst("td.reply").toPlainText().replace(",", "").trimmed(); strReply = strReply.replace(QRegExp("[\\D]"), ""); } { strLike = frame->documentElement().findFirst("a#upArticleLink").toPlainText().replace(",", "").trimmed(); } if (strHits.isEmpty()) { strHits = Find(frame->documentElement(),"span","class","kin_count m-tcol-c _rosReadcount").toPlainText(); } { bodydata.setTable(m_strTable); bodydata.setData(strData, bodydata.ARTICLE_DATA); bodydata.setData(strDate, bodydata.ARTICLE_DATE); bodydata.setData(strNick, bodydata.ARTICLE_NICKNAME); bodydata.setData(strID, bodydata.ARTICLE_ID); bodydata.setData(strHits, bodydata.ARTICLE_HIT); bodydata.setData(m_strUrl, bodydata.ARTICLE_URL); bodydata.setData("naver", bodydata.PLATFORM_NAME); bodydata.setData("cafe", bodydata.PLATFORM_FORM); bodydata.setData("body", bodydata.ARTICLE_FORM); bodydata.setData(m_strUrl.split("/").at(3), bodydata.PLATFORM_ID); bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID); bodydata.setData(strReply, bodydata.ARTICLE_ORDER); bodydata.setData(strTitle, bodydata.ARTICLE_TITLE); bodydata.setData(strLike, bodydata.REPLY_URL); /* QSqlQuery sql; QString strQuery = "update "; strQuery += m_strTable; strQuery += " set "; strQuery += "article_data = '" + strData + "',"; strQuery += "article_date = '" + strDate + "',"; strQuery += "article_nickname = '" + strNick + "',"; strQuery += "article_id = '" + strID + "',"; strQuery += "article_hit = '" + strHits + "'"; strQuery += "where article_url='"; strQuery += m_strUrl; strQuery += "'"; QString strUtf8(strQuery.toUtf8()); if (sql.exec(strUtf8) == false) cout << "error : " << sql.lastError().text().toStdString(); */ } } // Comment { QWebElement 
group = Find(frame->documentElement(),"ul","id","cmt_list"); QWebElementCollection elements = group.findAll("li"); QString strParent; int nCount = 0; foreach (QWebElement element, elements) { if (element.attribute("class").isEmpty()) { QString strData = SqlString(Find(element,"span","class","comm_body").toPlainText().trimmed()); if (strData.isEmpty()) continue; strData = GetSafeUtf(strData); QString strID = Find(element,"input","name","writerid").attribute("value").trimmed(); QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed(); strParent = strNick; QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed(); if (strDate.isEmpty()) continue; QSqlQuery query; query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,platform_id,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:PLATFORMID,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8()); query.bindValue(":URL",m_strUrl.toUtf8()); query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8()); query.bindValue(":ID",strID.toUtf8()); query.bindValue(":NICK",strNick.toUtf8()); query.bindValue(":DATA",strData.toUtf8()); query.bindValue(":DATE",strDate.toUtf8()); query.bindValue(":URLREPLY",m_strReper.toUtf8()); query.bindValue(":ROWNUM",nCount++); if (query.exec()==false) cout << "error : " << query.lastError().text().toStdString(); } if (element.attribute("class") == "reply") { QString strData = Find(element,"span","class","comm_body").toPlainText().trimmed(); if (strData.isEmpty()) continue; QString strReParent = strParent; QString strID = Find(element,"input","name","writerid").attribute("value").trimmed(); QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed(); QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed(); QWebElement eleParent = 
Find(element,"span","class","re-p-nick"); if (eleParent.toPlainText().isEmpty() == false) strReParent = eleParent.toPlainText(); QSqlQuery query; query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,platform_id,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:PLATFORMID,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8()); query.bindValue(":URL",m_strUrl.toUtf8()); query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8()); query.bindValue(":ID",strID.toUtf8()); query.bindValue(":NICK",strNick.toUtf8()); query.bindValue(":DATA",strData.toUtf8()); query.bindValue(":DATE",strDate.toUtf8()); query.bindValue(":PARENT",strReParent.toUtf8()); query.bindValue(":URLREPLY",m_strReper.toUtf8()); query.bindValue(":ROWNUM",nCount++); if (query.exec()==false) cout << "error : " << query.lastError().text().toStdString(); } } } m_bUse = true; } foreach(QWebFrame *childFrame, frame->childFrames()) saveFrameCafeUrl(childFrame); } void SCrawler::saveFrameDaumCafeList(QWebFrame *frame) { if (m_bUse == true) return; ///static int cz = 0; // Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml()); //int nLast = 0; QStringList urlList; QWebElement eleMain = Find(frame->documentElement(),"div","class","type_fulltext wid_f"); foreach(QWebElement eleSub,eleMain.findAll("div")) { if (eleSub.attribute("class") == "wrap_cont") { //nLast++; QString strUrl; foreach(QWebElement eleSubUrl,eleSub.findAll("a")) { if (eleSubUrl.attribute("class") == "f_url") strUrl = eleSubUrl.attribute("href"); } if (strUrl.split("/").at(2) == "cafe.daum.net") { urlList << strUrl; } } } if(urlList.size() > 0) { QString strUrlList; strUrlList = "("; foreach(QString str, urlList) { strUrlList += "'"; QStringList strlist = str.split("?"); if(strlist.size() > 1) strUrlList += strlist.at(0).trimmed(); else strUrlList += str; strUrlList += "',"; } 
strUrlList = strUrlList.left(strUrlList.size() - 1); strUrlList += ")"; QSqlQuery sql; QString strQuery = "delete from "; strQuery += m_strTable; strQuery += QString(" where article_url in %1").arg(strUrlList); // qDebug() << strQuery; if (sql.exec(strQuery.toUtf8()) == false) { cout << "error " << sql.lastError().text().toStdString(); cout << strQuery.toStdString(); } } foreach(QWebElement eleSub,eleMain.findAll("div")) { if (eleSub.attribute("class") == "wrap_cont") { //nLast++; QString strUrl,strTitle; foreach(QWebElement eleSubUrl,eleSub.findAll("a")) { if (eleSubUrl.attribute("class") == "f_url") strUrl = eleSubUrl.attribute("href"); if (eleSubUrl.attribute("class") == "f_link_bu f_l") strTitle = GetSafeUtf(SqlString(eleSubUrl.toPlainText().trimmed())); } if (strUrl.split("/").at(2) == "cafe.daum.net") { //QSqlQuery sql; { /* QString strQuery = QString("insert into "); strQuery += m_strTable; strQuery += QString(" set platform_name='daum',platform_form='cafe',article_form='body',article_url='%1',platform_id='%2',article_title='%3',keyword_id='%4'").arg(strUrl).arg(strUrl.split("/").at(3)).arg(strTitle).arg(m_strKeywordID); QString strUtf8(strQuery.toUtf8()); if (sql.exec(strUtf8) == false) cout << "x " << sql.lastError().text().toStdString(); else */ cout << "o " << strUrl.toStdString() << endl; } //else // cout << "v " << strUrl.toStdString() << endl; } m_bUse = true; } } { /* QWebElement noResult = Find(frame->documentElement(),"div","id","noResult"); if(!noResult.isNull()) m_bLast = true; */ if(eleMain.isNull()) m_bLast = true; } { QWebElement noResult = Find(frame->documentElement(),"div","id","noResult"); if(!noResult.isNull()) { m_bLast = true; return; } } { bool b_last = false; b_last = Find(frame->documentElement(), "div", "class", "result_message mg_cont hide").isNull(); b_last = b_last | !(Find(frame->documentElement(), "div", "class", "result_message mg_cont").isNull()); QWebElement total = Find(eleMain,"span","class","f_nb f_l"); if 
(total.toPlainText().isEmpty()) {m_bError = true; return;} QString strTotal = total.toPlainText().split("/").at(1); strTotal = strTotal.replace(",",""); QRegExp rx("(\\d+)"); int pos = 0; QList list; while ((pos = rx.indexIn(strTotal, pos)) != -1) { list << rx.cap(1); pos += rx.matchedLength(); } int nTotal = list.at(0).toInt(); QStringList strList = total.toPlainText().split("/").at(0).trimmed().split("-"); int nNow = GetNumber(strList.at(strList.size() - 1)); int nNowFirst = GetNumber(strList.at(strList.size() - 2)); if (nNow >= 1000 || nNow >= nTotal || (nNow - nNowFirst) < 9 || b_last) m_bLast = true; //cout << "nNow : " << nNow << endl << "nNow - nNowFirst: " << (nNow - nNowFirst) << endl << "b_last : " << b_last << endl; } } void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame) { if (m_bUse) return; QWebElement other = frame->documentElement().findFirst("title"); QString strTitle = other.toPlainText().trimmed().split("|").at(0).trimmed(); QString strUrl_; if (strTitle.isEmpty() == false) { bodydata.setTable(m_strTable); QStringList strlist = m_strUrl.split("?"); if(strlist.size() > 1) { bodydata.setData(strlist.at(0).trimmed(), bodydata.ARTICLE_URL); strUrl_ = strlist.at(0).trimmed(); } else { bodydata.setData(m_strUrl, bodydata.ARTICLE_URL); strUrl_ = m_strUrl; } bodydata.setData(SqlString(GetSafeUtf(strTitle)), bodydata.PLATFORM_TITLE); } if (frame->frameName() == "down") { QString strHits; { QString strData,strDate,strNick,strID,strTitle; { QWebElement group = Find(frame->documentElement(),"div","class","bbs_contents"); strData = SqlString(group.toPlainText().trimmed()); strData = GetSafeUtf(strData); } { QWebElement group = Find(frame->documentElement(),"span","class","p11 ls0"); strDate = group.toPlainText().trimmed().replace(".","-"); strDate = strDate.replace("- "," "); if (strDate.isEmpty() == true) strDate = Find(frame->documentElement(),"input","name","PLAIN_REGDT").attribute("value"); else strDate += ":00"; } { QWebElement group = 
Find(frame->documentElement(),"div","class","subject"); QWebElement group2 = Find(group,"span","class","b"); strTitle = SqlString(group2.toPlainText().trimmed()); } { QWebElement group = Find(Find(frame->documentElement(),"div","class","article_writer"),"a","href","#"); strNick = group.toPlainText().trimmed(); QWebElement id = Find(frame->documentElement(),"div","class","article_writer"); QStringList list = FindLeft(id,"a","onclick","showSideView").attribute("onclick").trimmed().split(","); if (list.size() >= 2) strID = list.at(1).trimmed().replace("'",""); } QStringList strList = Find(frame->documentElement(),"div","class","article_writer").toPlainText().split("|"); foreach(QString str,strList) { QStringList substrList = str.split(" "); for(int i = 0;i < substrList.size();i++) { if((substrList.at(i).trimmed().compare("조회") == 0) || (substrList.at(i).trimmed().compare("\"조회\"") == 0)) { strHits = substrList.at(i+1).trimmed(); break; } } } { bodydata.setTable(m_strTable); bodydata.setData(strData, bodydata.ARTICLE_DATA); bodydata.setData(strDate, bodydata.ARTICLE_DATE); bodydata.setData(strNick, bodydata.ARTICLE_NICKNAME); if(!strID.isEmpty()) bodydata.setData(strID, bodydata.ARTICLE_ID); bodydata.setData(strHits, bodydata.ARTICLE_HIT); QStringList strlist = m_strUrl.split("?"); if(strlist.size() > 1) { bodydata.setData(strlist.at(0).trimmed(), bodydata.ARTICLE_URL); strUrl_ = strlist.at(0).trimmed(); } else { bodydata.setData(m_strUrl, bodydata.ARTICLE_URL); strUrl_ = m_strUrl; } bodydata.setData("daum", bodydata.PLATFORM_NAME); bodydata.setData("cafe", bodydata.PLATFORM_FORM); bodydata.setData("body", bodydata.ARTICLE_FORM); bodydata.setData(m_strUrl.split("/").at(3), bodydata.PLATFORM_ID); bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID); bodydata.setData(strTitle, bodydata.ARTICLE_TITLE); } } // Comment { QWebElement group = Find(frame->documentElement(),"div","class","commentDiv bg_sub"); QList elements = FindAllMid(group,"div","id","_cmt-",0,5); QString 
commHidden = "comment_hidden"; QString commPos = "comment_pos"; QString commReComm = "recomment_pos"; QString strParent; int nCount = 0; foreach (QWebElement element, elements) { if (element.attribute("class").trimmed().right(commHidden.length()).compare(commHidden,Qt::CaseInsensitive) != 0){ if (element.attribute("class").trimmed().left(commPos.length()).compare(commPos,Qt::CaseInsensitive) == 0) { QString strData = SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed()); if (strData.isEmpty()) continue; strData = GetSafeUtf(strData); QString strID; QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(","); if(strListID.length() > 2) strID = strListID.at(1).trimmed().replace("'",""); QString strNick = Find(element,"a","class","b").toPlainText().trimmed(); strParent = strNick; QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed(); QString strDate; if(strDatetest.count(".") == 0) { strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd"); strDate += (" " + strDatetest + ":00"); QDateTime getTime = QDateTime::fromString(strDate, "yyyy-MM-dd hh:mm:ss"); QDateTime nowTime = QDateTime::currentDateTime(); if(getTime > nowTime) { getTime.addDays(-1); strDate = getTime.toString("yyyy-MM-dd hh:mm:ss"); } } else { strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," "); } if (strDate.isEmpty()) continue; QSqlQuery query; query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8()); query.bindValue(":URL",strUrl_.toUtf8()); query.bindValue(":ID",strID.toUtf8()); query.bindValue(":NICK",strNick.toUtf8()); 
query.bindValue(":DATA",strData.toUtf8()); query.bindValue(":DATE",strDate.toUtf8()); query.bindValue(":ROWNUM",nCount++); query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8()); query.bindValue(":HITS",strHits.toUtf8()); query.bindValue(":TITLE",strTitle.toUtf8()); if (query.exec()==false) cout << "error : " << query.lastError().text().toStdString(); } if (element.attribute("class").trimmed().left(commReComm.length()).compare(commReComm,Qt::CaseInsensitive) == 0) { QString strData = SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed()); if (strData.isEmpty()) continue; QString strReParent = Find(element,"span","class","mention_nicknames text_counter txt_point b").toPlainText().trimmed(); if(strReParent.length() == 0) strReParent = strParent; QString strID; QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(","); if(strListID.length() > 2) strID = strListID.at(1).trimmed().replace("'",""); QString strNick = Find(element,"a","class","b").toPlainText().trimmed(); QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed(); QString strDate; if(strDatetest.count(".") == 0) { strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd"); strDate += (" " + strDatetest + ":00"); QDateTime getTime = QDateTime::fromString(strDate, "yyyy-MM-dd hh:mm:ss"); QDateTime nowTime = QDateTime::currentDateTime(); if(getTime > nowTime) { getTime.addDays(-1); strDate = getTime.toString("yyyy-MM-dd hh:mm:ss"); } } else { strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," "); } if (strDate.isEmpty()) continue; QSqlQuery query; query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order,platform_id,article_hit,platform_title) VALUES 
('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8()); query.bindValue(":URL",strUrl_.toUtf8()); query.bindValue(":ID",strID.toUtf8()); query.bindValue(":NICK",strNick.toUtf8()); query.bindValue(":DATA",strData.toUtf8()); query.bindValue(":DATE",strDate.toUtf8()); query.bindValue(":PARENT",strReParent.toUtf8()); //query.bindValue(":URLREPLY",m_strReper.toUtf8()); query.bindValue(":ROWNUM",nCount++); query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8()); query.bindValue(":HITS",strHits.toUtf8()); query.bindValue(":TITLE",strTitle.toUtf8()); //QWebView::page()->mainFrame()->evaluateJavaScript(""); if (query.exec()==false) cout << "error : " << query.lastError().text().toStdString(); } } } } m_bUse = true; } foreach(QWebFrame *childFrame, frame->childFrames()) saveFrameDaumCafeUrl(childFrame); } void SCrawler::saveFrameDaumBlogUrl(QWebFrame *frame){} void SCrawler::saveFrameDaumBlogComment(QWebFrame *frame){} void SCrawler::saveFrameDaumBlogList(QWebFrame *frame){} void SCrawler::saveFrameNewsList(QWebFrame *frame) { if (m_bUse == true) return; QWebElement notFound = Find(frame->documentElement(),"div","class","no_content"); if(notFound.isNull() == false) { m_bLast = true; return; } QWebElement eleMain = Find(frame->documentElement(),"div","class","srch_result_area headline"); foreach(QWebElement eleSub,eleMain.findAll("div")) { if (eleSub.attribute("class") == QString("info")) { QString str = Find(eleSub,"a","class","go_naver").attribute("href"); if (str.trimmed().isEmpty()) continue; if (str.contains("http://sports")) continue; m_bNothing = true; cout << "o " << str.toStdString() << endl; } } QStringList strTotal = bodydata.GetNumber(Find(frame->documentElement(),"span","class","result_num").toPlainText().trimmed()); QVector vecTotal; foreach(QString str,strTotal) { if (str.trimmed().isEmpty() == false) vecTotal.push_back(str.toInt()); } if (vecTotal.size() == 3) { if (vecTotal[0] >= vecTotal[1]) m_bLast 
= true; if (vecTotal[1] == vecTotal[2]) m_bLast = true; } else m_bError = true; m_bUse = true; } bool SCrawler::saveFrameNewsUrl(QWebFrame *frame) { if (m_bUse) return true; { QString strQuery = "delete from "; strQuery += m_strTable + " where article_url = '"; strQuery += m_strUrl + "'"; QSqlQuery query; if(query.exec(strQuery.toUtf8()) == false) { cout << query.lastError().text().toStdString(); cout << query.lastQuery().toStdString(); } } QString strTitle,strDate,strData,strPlatID,strPlatTitle,strlike,strReply,strSympathy; { QWebElement element = Find(frame->documentElement(),"div","class","article_info"); { strTitle = Find(element,"h3","id","articleTitle").toPlainText(); // Title; strDate = Find(element,"span","class","t11").toPlainText(); // Date } strData = Find(frame->documentElement(),"div","id","articleBodyContents").toPlainText(); strlike = Find(frame->documentElement(),"div","class","u_likeit_module").toPlainText(); strReply = Find(frame->documentElement(), "span", "class", "lo_txt").toPlainText().replace(QRegExp("[\\D]"), "").trimmed(); // normal strSympathy = frame->documentElement().findFirst("em.u_cnt").toPlainText().replace(QRegExp("[\\D]"), "").trimmed(); if (strReply.isEmpty()) { strReply = Find(frame->documentElement(), "a", "class", "reply_count").toPlainText().replace(QRegExp("[\\D]"), "").trimmed(); // entertain } if (strReply.isEmpty()) { strReply = Find(frame->documentElement(), "span", "id", "newsCommentCount").toPlainText().replace(QRegExp("[\\D]"), "").trimmed(); // sports } if (strReply.isEmpty()) { strReply = Find(frame->documentElement(), "span", "class", "u_cbox_count").toPlainText().replace(QRegExp("[\\D]"), "").trimmed(); } qDebug() << strReply << ", " << strSympathy; if ((strReply.isEmpty() || (strReply == "0") || strSympathy.isEmpty() || (strSympathy == "0")) && (++m_nRetryCount < RETRY_MAX)) { //qDebug() << "singleshot"; QTimer::singleShot(1000, this, SLOT(reloadPage())); return false; } if (strSympathy.isEmpty()) strSympathy = 
"0"; //e​ntertainment if (strTitle.isEmpty()) { QWebElement elementTitle = Find(frame->documentElement(),"div","class","end_ct_area"); strTitle = Find(elementTitle,"p","class","end_tit").toPlainText(); } //e​ntertainment if (strDate.isEmpty()) strDate = Find(element,"em").toPlainText(); if (strData.isEmpty()) strData = Find(frame->documentElement(),"div","id","articeBody").toPlainText(); if (strlike.isEmpty() == false) strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")"; else { //Debug("out.html",m_page->mainFrame()->toHtml()); } //sport { if(strTitle.isEmpty()) { strTitle = Find(frame->documentElement(),"h4","class","title").toPlainText(); } if(strData.isEmpty()) { strData = Find(frame->documentElement(), "div", "id", "newsEndContents").toPlainText(); QString strSpam = Find(frame->documentElement(), "div", "class", "link_news").toPlainText(); QString strSource = Find(frame->documentElement(), "p", "class", "source").toPlainText(); strData = strData.left(strData.length() - strSpam.length() - strSource.length()); } if(strDate.isEmpty()) { strDate = frame->documentElement().findFirst("div[class='info']>span").toPlainText(); QRegExp reDate("([\\d]{4}).([\\d]{2}).([\\d]{2})"); QRegExp reTime("([\\d]{2}):([\\d]{2})"); int pos = 0; QString date; QString time; while((pos = reDate.indexIn(strDate, pos)) != -1) { date = reDate.cap(1) + "-" + reDate.cap(2) + "-" + reDate.cap(3); break; } pos = 0; while((pos = reTime.indexIn(strDate, pos)) != -1) { if(strDate.contains("오후") && (reTime.cap(1) != "12")) time = QString::number(reTime.cap(1).toInt() + 12) + ":" + reTime.cap(2); else time = reTime.cap(1) + ":" + reTime.cap(2); break; } strDate = date + " " + time + ":00"; } } element = Find(frame->documentElement(),"div","class","press_logo"); if(!element.isNull()) { strPlatID = Find(element,"a").attribute("href"); strPlatTitle = Find(element,"img").attribute("alt"); QStringList strlistPlat = strPlatID.split("."); if(strlistPlat.size() > 2) { if (strlistPlat.at(0) == 
QString("http://www")) strPlatID = strlistPlat.at(1); } } else //sports { element = frame->documentElement().findFirst("span[class='logo']>img"); strPlatTitle = element.attribute("alt"); QWebElement link = Find(frame->documentElement(), "a", "class", "press_link"); QString strLink = link.attribute("href"); strPlatID = strLink.left(strLink.mid(8).indexOf('/') + 7); QStringList strlistPlat = strPlatID.split("."); if(strlistPlat.size() > 2) { if (strlistPlat.at(0) == QString("http://www")) strPlatID = strlistPlat.at(1); } } } bodydata.setTable(m_strTable); bodydata.setData(bodydata.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE); bodydata.setData(bodydata.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA); bodydata.setData(strPlatID,SCrawlerData::PLATFORM_ID); bodydata.setData(strPlatTitle,SCrawlerData::PLATFORM_TITLE); bodydata.setData(strDate, SCrawlerData::ARTICLE_DATE); bodydata.setData("naver", SCrawlerData::PLATFORM_NAME); bodydata.setData("news", SCrawlerData::PLATFORM_FORM); bodydata.setData("body", SCrawlerData::ARTICLE_FORM); bodydata.setData(m_strUrl, SCrawlerData::ARTICLE_URL); bodydata.setData(m_strKeywordID, SCrawlerData::KEYWORD_ID); bodydata.setData(strReply, SCrawlerData::ARTICLE_ORDER); bodydata.setData(strSympathy, SCrawlerData::REPLY_URL); m_bUse = true; return true; } bool SCrawler::saveFrameNewsComment(QWebFrame *frame) { if (m_bUse) return true; static bool bReplyDone = false; //static int reply_index = 0; static int iLoaded = 0; static bool bProcessed = false; if (bProcessed) return false; bProcessed = true; //qDebug() << frame->baseUrl().toString(); //qDebug() << "executed"; if(frame->baseUrl().toString().contains("entertain") && !frame->baseUrl().toString().contains("comment")) { m_page->mainFrame()->load(QUrl(frame->baseUrl().toString().replace("read", "comment/list"))); bProcessed = false; return false; } if(frame->baseUrl().toString().contains("sports") && !frame->baseUrl().toString().contains("m_view=1")) { 
m_page->mainFrame()->load(QUrl(frame->baseUrl().toString() + "&m_view=1"));
        bProcessed = false;
        return false;
    }
    // Phase 1: wait for the comment box and keep clicking the top-level
    // "more" button until the number of loaded items stops changing.
    if(m_nRetryCount < RETRY_MAX && !bReplyDone)
    {
        QWebElement u_cbox_paginate = Find(frame->documentElement(), "div", "class", "u_cbox_paginate");
        if (u_cbox_paginate.isNull())
        {
            // Pagination element not present yet: retry after RETRY_INTERVAL ms.
            ++m_nRetryCount;
            QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage()));
            bProcessed = false;
            return false;
        }
        else
        {
            QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button");
            if(!a.isNull())
            {
                // Simulate a click on "more" and schedule reloadPage(). The
                // timer is started in either case; when the item count did
                // not change, processing additionally continues below.
                a.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
                QTimer::singleShot(300, this, SLOT(reloadPage()));
                QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li");
                if (lis.count() != iLoaded)
                {
                    iLoaded = lis.count();
                    bProcessed = false;
                    return false;
                }
                else
                {
                    bReplyDone = true;
                }
            }
        }
    }

    // Several earlier, commented-out variants of the "click more / click
    // reply" logic that used to sit here and further below were dropped from
    // this listing; they were never compiled.

    // Phase 2: open the replies of one comment per pass. Only anchors whose
    // class is exactly "u_cbox_btn_reply" and that contain a reply-count
    // span are clicked.
    QWebElementCollection reply_btns = frame->findAllElements("a[class^='u_cbox_btn_reply']");
    foreach (QWebElement ele, reply_btns)
    {
        QWebElement btn = ele.findFirst("span[class='u_cbox_reply_cnt']");
        if ((ele.attribute("class") == "u_cbox_btn_reply") && !btn.isNull())
        {
            ele.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
            QTimer::singleShot(250, this, SLOT(reloadPage()));
            bProcessed = false;
            return false;
        }
    }
    // Phase 3: click any remaining "more" button, one per pass.
    QWebElementCollection allPaginate = frame->documentElement().findAll("div[class='u_cbox_paginate']");
    foreach (QWebElement ele, allPaginate)
    {
        QWebElement load_more = ele.findFirst("a[class='u_cbox_btn_more __cbox_page_button']");
        if (!load_more.isNull())
        {
            load_more.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
            QTimer::singleShot(250, this, SLOT(reloadPage()));
            bProcessed = false;
            return false;
        }
    }

    // Phase 4: nothing left to click; store every comment and reply.
    {
        QWebElement logo = Find(frame->documentElement(),"div","class","press_logo");
        QString strPlatID, strPlatTitle;
        {
            strPlatID = Find(logo,"a").attribute("href");
            strPlatTitle = Find(logo,"img").attribute("alt");
        }
        // "http://www.<name>.<tld>..." is reduced to "<name>".
        QStringList strlistPlat = strPlatID.split(".");
        if(strlistPlat.size() > 2)
        {
            if (strlistPlat.at(0) == QString("http://www"))
                strPlatID = strlistPlat.at(1);
        }
        QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li");
        int order = 0;  // running article_order shared by comments and replies
        foreach(QWebElement li, lis)
        {
            QWebElement comment_box = li.findFirst("div[class='u_cbox_comment_box']");
            QString strParent;  // author of this top-level comment
            {
                QString strID, strNick, strData, strLike, strDislike, strDate;
                strData = Find(comment_box, "span", "class", "u_cbox_contents").toPlainText();
                // The displayed name is used as id, nickname and parent alike.
                strNick = strParent = strID = Find(comment_box, "span", "class", "u_cbox_name").toPlainText();
                strLike = Find(comment_box, "em", "class", "u_cbox_cnt_recomm").toPlainText().replace(",", "");
                strDislike = Find(comment_box, "em", "class", "u_cbox_cnt_unrecomm").toPlainText().replace(",", "");
                // The vote counts are appended to the stored comment text.
                strData += "\n(goodCount:" + strLike +")\n(badCount:" + strDislike + ")";
                strDate = Find(comment_box, "span", "class", "u_cbox_date").toPlainText();
                if(strDate.contains(":"))
                    strDate += ":00";
                else
                {
                    // No ':' in the text: presumably a relative age. The
                    // last number in the text is subtracted from the current
                    // time as hours ("시간"), days ("일") or minutes ("분").
                    QDateTime current_time = QDateTime::currentDateTime();
                    QRegExp rx("(\\d+)");
                    int pos = 0;
                    QString strTime;
                    while ((pos = rx.indexIn(strDate, pos)) != -1)
                    {
                        strTime = rx.cap(1);
                        pos += rx.matchedLength();
                    }
                    if(strDate.contains("시간"))
                    {
                        current_time = current_time.addSecs(-(60 * 60 * strTime.toInt()));
                    }
                    else if(strDate.contains("일"))
                    {
                        current_time = current_time.addDays(-(strTime.toInt()));
                    }
                    else if(strDate.contains("분"))
                    {
                        current_time = current_time.addSecs(-(60 * strTime.toInt()));
                    }
                    else
                    {
                    }
                    strDate = current_time.toString("yyyy-MM-dd hh:mm:ss");
                }
                {
                    QSqlQuery query;
                    query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_order,platform_id,platform_title, article_date) "
                                  "VALUES ('naver','news','reply',:URL,:ID,:NICK,:DATA,:ROWNUM,:PLATFORMID,:TITLE,:DATE)").toUtf8());
                    // NOTE(review): QString::replace() modifies m_strUrl in
                    // place, so the member itself loses "&m_view=1" from
                    // here on — confirm that callers expect this.
                    query.bindValue(":URL",m_strUrl.replace("&m_view=1","").toUtf8());
                    query.bindValue(":ID",strID.toUtf8());
                    query.bindValue(":NICK",strNick.toUtf8());
                    query.bindValue(":DATA",strData.toUtf8());
                    query.bindValue(":ROWNUM",order++);
                    query.bindValue(":PLATFORMID",strPlatID.toUtf8());
                    query.bindValue(":TITLE",strPlatTitle.toUtf8());
                    query.bindValue(":DATE", strDate.toUtf8());
                    if (query.exec()==false)
                        cout << "error : " << query.lastError().text().toStdString();
                }
            }
            // Replies nested under this comment.
            QWebElement reply_area = li.findFirst("div[class='u_cbox_reply_area']");
            QWebElementCollection sub_lis = reply_area.findAll("ul[class='u_cbox_list']>li");
            foreach(QWebElement sub_li, sub_lis)
            {
                QString strID, strNick, strData, strDate;
                strData = Find(sub_li, "span", "class", "u_cbox_contents").toPlainText();
                strNick = strID = Find(sub_li, "span", "class", "u_cbox_name").toPlainText();
                strDate = Find(sub_li, "span", "class", "u_cbox_date").toPlainText();
                if(strDate.contains(":"))
                    strDate += ":00";
                else
                {
                    // Same relative-age handling as for top-level comments.
                    QDateTime current_time = QDateTime::currentDateTime();
                    QRegExp rx("(\\d+)");
                    int pos = 0;
                    QString strTime;
                    while ((pos = rx.indexIn(strDate, pos)) != -1)
                    {
                        strTime = rx.cap(1);
                        pos += rx.matchedLength();
                    }
                    if(strDate.contains("시간"))
                    {
                        current_time = current_time.addSecs(-(60 * 60 * strTime.toInt()));
                    }
                    else if(strDate.contains("일"))
                    {
                        current_time = current_time.addDays(-(strTime.toInt()));
                    }
                    else if(strDate.contains("분"))
                    {
                        current_time = current_time.addSecs(-(60 * strTime.toInt()));
                    }
                    else
                    {
                        ;
                    }
                    strDate = current_time.toString("yyyy-MM-dd hh:mm:ss");
                }
                {
                    QSqlQuery query;
                    query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_order,platform_id,platform_title, article_date, article_parent) "
                                  "VALUES ('naver','news','reply',:URL,:ID,:NICK,:DATA,:ROWNUM,:PLATFORMID,:TITLE,:DATE,:PARENT)").toUtf8());
                    query.bindValue(":URL",m_strUrl.replace("&m_view=1","").toUtf8());
                    query.bindValue(":ID",strID.toUtf8());
                    query.bindValue(":NICK",strNick.toUtf8());
                    query.bindValue(":DATA",strData.toUtf8());
                    query.bindValue(":ROWNUM",order++);
                    query.bindValue(":PLATFORMID",strPlatID.toUtf8());
                    query.bindValue(":TITLE",strPlatTitle.toUtf8());
                    query.bindValue(":DATE", strDate.toUtf8());
                    query.bindValue(":PARENT", strParent.toUtf8());
                    if (query.exec()==false)
                        cout << "error : " << query.lastError().text().toStdString();
                }
            }
        }
        qDebug() << "lis count: " << lis.count();
    }
    m_bUse = true;
    bProcessed = false;
    return true;
}

/**
 * Returns the first element among _FindElement.findAll(_strElement) whose
 * attribute _strAttrib equals _strFind exactly, or a default-constructed
 * QWebElement when there is none.
 */
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    QWebElementCollection elements = _FindElement.findAll(_strElement);
    foreach (QWebElement element, elements)
    {
        if
(element.attribute(_strAttrib) == _strFind) { return element; } } QWebElement element; return element; } QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength) { QWebElementCollection elements = _FindElement.findAll(_strElement); foreach (QWebElement element, elements) { QString str = element.attribute(_strAttrib).trimmed().mid(_strStart,_strLength); if (QString::compare(str,_strFind,Qt::CaseInsensitive)==0) { return element; } } QWebElement element; return element; } QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart) { int _strLength = _strFind.length(); return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart, _strLength); } QWebElement SCrawler::FindRight(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind) { QWebElementCollection elements = _FindElement.findAll(_strElement); foreach (QWebElement element, elements) { QString str = element.attribute(_strAttrib).trimmed().right(_strFind.length()); cout << "FindRight : " << str.toStdString() << endl; cout << "FindRight right : " << _strFind.toStdString() << endl; if (QString::compare(str,_strFind,Qt::CaseInsensitive)==0) { return element; } } QWebElement element; return element; } QWebElement SCrawler::FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind) { int _strStart = 0; return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart); } QList SCrawler::FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength) { QWebElementCollection elements = _FindElement.findAll(_strElement); QList returnElements = QList(); foreach (QWebElement element, 
elements) { QString str = element.attribute(_strAttrib).trimmed().mid(_strStart,_strLength); if (QString::compare(str,_strFind,Qt::CaseInsensitive)==0) { returnElements.append(element); } } return returnElements; } bool SCrawler::getProxyList(QString &_str) { QSqlQuery sqlquery; QString strquery = "select proxy, port from Proxy limit 300"; QString queryutf = strquery.toUtf8(); if(sqlquery.exec(queryutf) == false) { return false; } while(sqlquery.next()) { QString str = sqlquery.value(0).toString(); str += ","; str += sqlquery.value(1).toString(); str += "\n"; _str += str; } return true; } bool SCrawler::setProxyFromFile() { QFile file("proxy.txt"); QRegExp rx("^\\s*([\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3})[^\\d]*([\\d]*)"); if (file.open(QIODevice::ReadOnly | QIODevice::Text)) { QVector vecProxy; while (!file.atEnd()) { QString str = QString(file.readLine()); if (str.isEmpty()) continue; int pos = 0; QStringList strList; while ((pos = rx.indexIn(str, pos)) != -1) { if (!rx.cap(1).isEmpty()) strList.append(rx.cap(1)); if (!rx.cap(2).isEmpty()) strList.append(rx.cap(2)); pos += rx.matchedLength(); } if (!strList.isEmpty()) vecProxy.push_back(strList); } if (vecProxy.size() > 0) { QStringList strList = vecProxy.at(rand()%vecProxy.size()); //QNetworkAccessManager *manager = new QNetworkAccessManager; switch(strList.size()) { case 1: cout << "p : " << strList.at(0).toStdString() << " from File" << endl; //manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); //m_page->setNetworkAccessManager(manager); QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); break; case 2: cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl; //manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); //m_page->setNetworkAccessManager(manager); QNetworkProxy::setApplicationProxy(*(new 
QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); break; } } else { return false; } file.close(); return true; } else return false; } bool SCrawler::setProxyFromDb() { QString proxyList; if (getProxyList(proxyList)) { QVector vecProxy; QStringList strListProxy = proxyList.split("\n"); foreach(QString str, strListProxy) { str = str.trimmed(); if (str.isEmpty()) continue; vecProxy.push_back(str.split(",")); } if (vecProxy.size() > 0) { QStringList strList = vecProxy.at(rand()%vecProxy.size()); switch(strList.size()) { case 1: cout << "p : " << strList.at(0).toStdString() << " from DB" << endl; QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); break; case 2: cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from DB" << endl; m_strProxyIP = strList.at(0); m_nProxyPort = strList.at(1).toInt(); QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); /* QString strProxyHost = "61.103.7.74"; int nPort = 2074; QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strProxyHost,nPort))); */ break; } return true; } else { return false; } } else { return false; } } void SCrawler::setProxy() { bool ok = setProxyFromFile() || setProxyFromDb(); //bool ok = false; if (!ok) cout << "No Proxy" << endl; } void SCrawler::deleteProxy() { if (m_strProxyIP.isEmpty()) return; QSqlQuery sqlquery; QString strquery = "delete from Proxy where proxy = '" + m_strProxyIP + "' and port = " + QString::number(m_nProxyPort); if(sqlquery.exec(strquery.toUtf8()) == false) { cout << "Error : " << strquery.toStdString() << endl; cout << sqlquery.lastError().text().toStdString() << endl; } }