diff --git a/CrawlerProcess/scrawler.cpp b/CrawlerProcess/scrawler.cpp index d8f8a39..41e6141 100644 --- a/CrawlerProcess/scrawler.cpp +++ b/CrawlerProcess/scrawler.cpp @@ -61,35 +61,8 @@ void SCrawler::load(QStringList _strlistArgv) break; } } - } - QString proxyList; - if (getProxyList(proxyList)) - { - QVector vecProxy; - QStringList strListProxy = proxyList.split("\n"); - foreach(QString str, strListProxy) - { - str = str.trimmed(); - if (str.isEmpty()) continue; - vecProxy.push_back(str.split(",")); - } - if (vecProxy.size() > 0) - { - QStringList strList = vecProxy.at(rand()%vecProxy.size()); - switch(strList.size()) - { - case 1: - cout << "p : " << strList.at(0).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); - break; - case 2: - cout << "p : " << strList.at(0).toStdString() << ", " << strList.at(1).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); - break; - } - } - } - */ + }*/ + setProxy(); } @@ -106,34 +79,7 @@ void SCrawler::load(QStringList _strlistArgv) m_strUrl = _strlistArgv[2]; m_nSelect = E_NAVER_BLOG_LIST; m_strKeywordID = _strlistArgv[4]; - //cout << "ok"; - /* - QFile file("proxy.txt"); - if (file.open(QIODevice::ReadOnly | QIODevice::Text)) - { - QVector vecProxy; - while (!file.atEnd()) - { - QString str = QString(file.readLine()); - if (str.isEmpty()) continue; - vecProxy.push_back(str.split(",")); - } - if (vecProxy.size() > 0) - { - QStringList strList = vecProxy.at(rand()%vecProxy.size()); - switch(strList.size()) - { - case 1: - cout << "p : " << strList.at(0).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); - break; - case 2: - cout << "p : " << strList.at(0).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); - break; - } - } - }*/ + setProxy(); } @@ -162,33 +108,6 @@ void SCrawler::load(QStringList _strlistArgv) m_nSelect = E_DAUM_CAFE_LIST; m_strKeywordID = _strlistArgv[4]; setProxy(); - /* - QFile file("proxy.txt"); - if (file.open(QIODevice::ReadOnly | QIODevice::Text)) - { - QVector vecProxy; - while (!file.atEnd()) - { - QString str = QString(file.readLine()); - if (str.isEmpty()) continue; - vecProxy.push_back(str.split(",")); - } - if (vecProxy.size() > 0) - { - QStringList strList = vecProxy.at(rand()%vecProxy.size()); - switch(strList.size()) - { - case 1: - cout << "p : " << strList.at(0).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); - break; - case 2: - cout << "p : " << strList.at(0).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); - break; - } - } - }*/ } if (_strlistArgv[1] == "cafe_data") @@ -204,33 +123,7 @@ void SCrawler::load(QStringList _strlistArgv) m_nSelect = E_DAUM_BLOG_LIST; m_strKeywordID = _strlistArgv[4]; //cout << "ok"; - - QFile file("proxy.txt"); - if (file.open(QIODevice::ReadOnly | QIODevice::Text)) - { - QVector vecProxy; - while (!file.atEnd()) - { - QString str = QString(file.readLine()); - if (str.isEmpty()) continue; - vecProxy.push_back(str.split(",")); - } - if (vecProxy.size() > 0) - { - QStringList strList = vecProxy.at(rand()%vecProxy.size()); - switch(strList.size()) - { - case 1: - cout << "p : " << strList.at(0).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); - break; - case 2: - cout << "p : " << strList.at(0).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); - break; - } - } - } + setProxy(); } if (_strlistArgv[1] == "blog_url") @@ -697,9 +590,9 @@ void SCrawler::saveFrameComment(QWebFrame *frame) { QWebElement group = Find(frame->documentElement(),"ul","id","commentList"); QWebElementCollection elements = group.findAll("li"); - QString strParent,strDate,strNick,strComm,strUrl; + QString strParent,strDate,strNick,strComm,strUrl,strId; QStringList strList = m_strUrl.split("/"); - + QString strCommUrl; for (int i=0; i < 5; i++) strUrl += strList.at(i) + "/"; @@ -713,6 +606,13 @@ void SCrawler::saveFrameComment(QWebFrame *frame) strNick = strParent = GetSafeUtf(Find(element,"a","class","nick pcol2").toPlainText()); strDate = Find(element,"span","class","date fil5 pcol2").toPlainText(); strComm = Find(element,"dd","class","comm pcol2").toPlainText(); + + strCommUrl = Find(element,"a","class","nick pcol2").attribute("href"); + if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0) + strId = strCommUrl.split("/").at(3).trimmed(); + if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0) + strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed(); + strComm = GetSafeUtf(strComm); if (strComm.isEmpty()== false) { @@ -721,7 +621,13 @@ void SCrawler::saveFrameComment(QWebFrame *frame) strComm = strComm.trimmed(); //cout << strComm.toStdString() << endl; QSqlQuery query; - query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8()); + if(strId.length() > 0) + { + query.prepare(QString("insert into " + m_strTable + " (article_id,article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:ID,:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8()); + query.bindValue(":ID", strId.toUtf8()); + } + else + query.prepare(QString("insert into " + m_strTable + " (article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8()); /* cout << "m_strTable = " << m_strTable.toStdString() << endl; cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl; @@ -730,7 +636,9 @@ void SCrawler::saveFrameComment(QWebFrame *frame) cout << "urlReply = " << m_strUrl.toStdString() << endl; cout << "ronum = " << nCount << endl; */ + query.bindValue(":URL", strUrl.toUtf8()); + query.bindValue(":PLATFORMID",strUrl.split("/").at(3).toUtf8()); query.bindValue(":NICK",strNick.toUtf8()); query.bindValue(":DATA",strComm.toUtf8()); query.bindValue(":DATE",strDate.toUtf8()); @@ -748,7 +656,14 @@ void SCrawler::saveFrameComment(QWebFrame *frame) strDate = Find(element,"span","class","date fil5 pcol2").toPlainText(); QWebElement subElement = Find(element,"dd","class","comm pcol2"); QString subNick = Find(subElement,"a","class","nick pcol2").toPlainText(); - strComm = subElement.toPlainText(); + strComm = subElement.toPlainText(); + + strCommUrl = Find(element,"a","class","nick pcol2").attribute("href"); + if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0) + strId = strCommUrl.split("/").at(3).trimmed(); + if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0) + strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed(); + if(subNick.isEmpty() == false) { strNick = strParent; @@ -771,9 +686,16 @@ void SCrawler::saveFrameComment(QWebFrame *frame) cout << "ronum = " << nCount << endl; cout << "parent = " << strParent.toStdString() << endl; */ + if(strId.length() > 0) + { + query.prepare(QString("insert into " + m_strTable + " (article_id,article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:ID,:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8()); + query.bindValue(":ID", strId.toUtf8()); + } + else + query.prepare(QString("insert into " + m_strTable + " (article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8()); - query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,article_parent,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8()); query.bindValue(":URL",strUrl.toUtf8()); + query.bindValue(":PLATFORMID",strUrl.split("/").at(3).toUtf8()); query.bindValue(":NICK",strNick.toUtf8()); query.bindValue(":DATA",strComm.toUtf8()); query.bindValue(":DATE",strDate.toUtf8()); @@ -818,13 +740,12 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame) if (strUrl.split("/").at(2) == "cafe.naver.com") { QSqlQuery sql; - /* - QString strQuery = "select URL from "; - strQuery += m_strTableBody; - strQuery += QString(" where URL = '%1'").arg(strUrl); + + QString strQuery = "select article_url from "; + strQuery += m_strTable; + strQuery += QString(" where article_url = '%1'").arg(strUrl); sql.exec(strQuery); - if (sql.size() == 0) - */ + if (sql.size() == 0 || sql.size() == -1) { QString strQuery = QString("insert into "); strQuery += m_strTable; @@ -833,8 +754,12 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame) if (sql.exec(strUtf8) == false) cout << "x " << sql.lastError().text().toStdString(); else + { cout << "o " << strUrl.toStdString() << endl; + } } + else + cout << "v " << strUrl.toStdString() << endl; } m_bUse = true; } @@ -962,8 +887,9 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame) if (strDate.isEmpty()) continue; QSqlQuery query; - query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8()); + query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,platform_id,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:PLATFORMID,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8()); query.bindValue(":URL",m_strUrl.toUtf8()); + query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8()); query.bindValue(":ID",strID.toUtf8()); query.bindValue(":NICK",strNick.toUtf8()); query.bindValue(":DATA",strData.toUtf8()); @@ -986,8 +912,9 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame) if (eleParent.toPlainText().isEmpty() == false) strReParent = eleParent.toPlainText(); QSqlQuery query; - query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8()); + query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,platform_id,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:PLATFORMID,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8()); query.bindValue(":URL",m_strUrl.toUtf8()); + query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8()); query.bindValue(":ID",strID.toUtf8()); query.bindValue(":NICK",strNick.toUtf8()); query.bindValue(":DATA",strData.toUtf8()); @@ -1033,13 +960,12 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame) if (strUrl.split("/").at(2) == "cafe.daum.net") { QSqlQuery sql; - /* - QString strQuery = "select URL from "; - strQuery += m_strTableBody; - strQuery += QString(" where URL = '%1'").arg(strUrl); + + QString strQuery = "select article_url from "; + strQuery += m_strTable; + strQuery += QString(" where article_url = '%1'").arg(strUrl); sql.exec(strQuery); - if (sql.size() == 0) - */ + if (sql.size() == 0 || sql.size() == -1) { QString strQuery = QString("insert into "); strQuery += m_strTable; @@ -1050,6 +976,8 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame) else cout << "o " << strUrl.toStdString() << endl; } + else + cout << "v " << strUrl.toStdString() << endl; } m_bUse = true; } @@ -1126,27 +1054,10 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame) QWebElement group = Find(Find(frame->documentElement(),"div","class","article_writer"),"a","href","#"); strNick = group.toPlainText().trimmed(); - if (strNick.isEmpty() == false) - { - /* - QStringList list = strNick.split("("); - if (list.isEmpty() == false) - strNick = list.at(0); - */ - QWebElement id = Find(frame->documentElement(),"td","class","m-tcol-c b nick"); - QStringList list = Find(id,"a","class","m-tcol-c b").attribute("onclick").trimmed().split(","); - if (list.size() >= 2) - strID = list.at(1).trimmed().replace("'",""); - } - else - { - QStringList list = Find(Find(frame->documentElement(),"td","class","m-tcol-c b nick"),"a","class","m-tcol-c b").attribute("onclick").trimmed().split(","); - if (list.size() >= 4) - { - strID = list.at(1).trimmed().replace("'",""); - strNick = list.at(3).trimmed().replace("'",""); - } - } + QWebElement id = Find(frame->documentElement(),"div","class","article_writer"); + QStringList list = FindLeft(id,"a","onclick","showSideView").attribute("onclick").trimmed().split(","); + if (list.size() >= 2) + strID = list.at(1).trimmed().replace("'",""); } QStringList strList = Find(frame->documentElement(),"div","class","article_writer").toPlainText().split("|"); @@ -1210,15 +1121,31 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame) QString strData = SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed()); if (strData.isEmpty()) continue; strData = GetSafeUtf(strData); - QString strID = Find(element,"input","name","writerid").attribute("value").trimmed(); + + QString strID; + QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(","); + if(strListID.length() > 2) + strID = strListID.at(1).trimmed().replace("'",""); + QString strNick = Find(element,"a","class","b").toPlainText().trimmed(); strParent = strNick; - QString strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," "); + + QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed(); + QString strDate; + if(strDatetest.count(".") == 0) + { + strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd"); + strDate += (" " + strDatetest); + } + else + { + strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," "); + } + if (strDate.isEmpty()) continue; else strDate += ":00"; QSqlQuery query; - //query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8()); - //query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM)").toUtf8()); + query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8()); query.bindValue(":URL",m_strUrl.toUtf8()); query.bindValue(":ID",strID.toUtf8()); @@ -1227,7 +1154,7 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame) query.bindValue(":DATE",strDate.toUtf8()); //query.bindValue(":URLREPLY",m_strReper.toUtf8()); query.bindValue(":ROWNUM",nCount++); - query.bindValue(":PLATFORMID",m_strUrl.split("/").at(1).toUtf8()); + query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8()); query.bindValue(":HITS",strHits.toUtf8()); query.bindValue(":TITLE",strTitle.toUtf8()); @@ -1243,15 +1170,28 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame) if(strReParent.length() == 0) strReParent = strParent; - QString strID = Find(element,"input","name","writerid").attribute("value").trimmed(); + QString strID; + QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(","); + if(strListID.length() > 2) + strID = strListID.at(1).trimmed().replace("'",""); + QString strNick = Find(element,"a","class","b").toPlainText().trimmed(); - QString strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," "); + QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed(); + QString strDate; + if(strDatetest.count(".") == 0) + { + strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd"); + strDate += (" " + strDatetest); + } + else + { + strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," "); + } + if (strDate.isEmpty()) continue; else strDate += ":00"; - QSqlQuery query; - //query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8()); - //query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM)").toUtf8()); + query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8()); query.bindValue(":URL",m_strUrl.toUtf8()); query.bindValue(":ID",strID.toUtf8()); @@ -1261,7 +1201,7 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame) query.bindValue(":PARENT",strReParent.toUtf8()); //query.bindValue(":URLREPLY",m_strReper.toUtf8()); query.bindValue(":ROWNUM",nCount++); - query.bindValue(":PLATFORMID",m_strUrl.split("/").at(1).toUtf8()); + query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8()); query.bindValue(":HITS",strHits.toUtf8()); query.bindValue(":TITLE",strTitle.toUtf8()); //QWebView::page()->mainFrame()->evaluateJavaScript("");