다음카페 댓글에 시간만 표시될 때 크롤링 에러가 나는 부분 임시 수정

git-svn-id: svn://192.168.0.12/source@61 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2015-03-23 06:22:18 +00:00
parent c30f40e0b5
commit a446318723

View File

@@ -61,35 +61,8 @@ void SCrawler::load(QStringList _strlistArgv)
break;
}
}
}
QString proxyList;
if (getProxyList(proxyList))
{
QVector <QStringList> vecProxy;
QStringList strListProxy = proxyList.split("\n");
foreach(QString str, strListProxy)
{
str = str.trimmed();
if (str.isEmpty()) continue;
vecProxy.push_back(str.split(","));
}
if (vecProxy.size() > 0)
{
QStringList strList = vecProxy.at(rand()%vecProxy.size());
switch(strList.size())
{
case 1:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break;
case 2:
cout << "p : " << strList.at(0).toStdString() << ", " << strList.at(1).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
break;
}
}
}
*/
}*/
setProxy();
}
@@ -106,34 +79,7 @@ void SCrawler::load(QStringList _strlistArgv)
m_strUrl = _strlistArgv[2];
m_nSelect = E_NAVER_BLOG_LIST;
m_strKeywordID = _strlistArgv[4];
//cout << "ok";
/*
QFile file("proxy.txt");
if (file.open(QIODevice::ReadOnly | QIODevice::Text))
{
QVector <QStringList> vecProxy;
while (!file.atEnd())
{
QString str = QString(file.readLine());
if (str.isEmpty()) continue;
vecProxy.push_back(str.split(","));
}
if (vecProxy.size() > 0)
{
QStringList strList = vecProxy.at(rand()%vecProxy.size());
switch(strList.size())
{
case 1:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break;
case 2:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
break;
}
}
}*/
setProxy();
}
@@ -162,33 +108,6 @@ void SCrawler::load(QStringList _strlistArgv)
m_nSelect = E_DAUM_CAFE_LIST;
m_strKeywordID = _strlistArgv[4];
setProxy();
/*
QFile file("proxy.txt");
if (file.open(QIODevice::ReadOnly | QIODevice::Text))
{
QVector <QStringList> vecProxy;
while (!file.atEnd())
{
QString str = QString(file.readLine());
if (str.isEmpty()) continue;
vecProxy.push_back(str.split(","));
}
if (vecProxy.size() > 0)
{
QStringList strList = vecProxy.at(rand()%vecProxy.size());
switch(strList.size())
{
case 1:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break;
case 2:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
break;
}
}
}*/
}
if (_strlistArgv[1] == "cafe_data")
@@ -204,33 +123,7 @@ void SCrawler::load(QStringList _strlistArgv)
m_nSelect = E_DAUM_BLOG_LIST;
m_strKeywordID = _strlistArgv[4];
//cout << "ok";
QFile file("proxy.txt");
if (file.open(QIODevice::ReadOnly | QIODevice::Text))
{
QVector <QStringList> vecProxy;
while (!file.atEnd())
{
QString str = QString(file.readLine());
if (str.isEmpty()) continue;
vecProxy.push_back(str.split(","));
}
if (vecProxy.size() > 0)
{
QStringList strList = vecProxy.at(rand()%vecProxy.size());
switch(strList.size())
{
case 1:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break;
case 2:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
break;
}
}
}
setProxy();
}
if (_strlistArgv[1] == "blog_url")
@@ -697,9 +590,9 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
{
QWebElement group = Find(frame->documentElement(),"ul","id","commentList");
QWebElementCollection elements = group.findAll("li");
QString strParent,strDate,strNick,strComm,strUrl;
QString strParent,strDate,strNick,strComm,strUrl,strId;
QStringList strList = m_strUrl.split("/");
QString strCommUrl;
for (int i=0; i < 5; i++)
strUrl += strList.at(i) + "/";
@@ -713,6 +606,13 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
strNick = strParent = GetSafeUtf(Find(element,"a","class","nick pcol2").toPlainText());
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
strComm = Find(element,"dd","class","comm pcol2").toPlainText();
strCommUrl = Find(element,"a","class","nick pcol2").attribute("href");
if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0)
strId = strCommUrl.split("/").at(3).trimmed();
if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0)
strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed();
strComm = GetSafeUtf(strComm);
if (strComm.isEmpty()== false)
{
@@ -721,7 +621,13 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
strComm = strComm.trimmed();
//cout << strComm.toStdString() << endl;
QSqlQuery query;
query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
if(strId.length() > 0)
{
query.prepare(QString("insert into " + m_strTable + " (article_id,article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:ID,:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
query.bindValue(":ID", strId.toUtf8());
}
else
query.prepare(QString("insert into " + m_strTable + " (article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
/*
cout << "m_strTable = " << m_strTable.toStdString() << endl;
cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl;
@@ -730,7 +636,9 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
cout << "urlReply = " << m_strUrl.toStdString() << endl;
cout << "ronum = " << nCount << endl;
*/
query.bindValue(":URL", strUrl.toUtf8());
query.bindValue(":PLATFORMID",strUrl.split("/").at(3).toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strComm.toUtf8());
query.bindValue(":DATE",strDate.toUtf8());
@@ -749,6 +657,13 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
QWebElement subElement = Find(element,"dd","class","comm pcol2");
QString subNick = Find(subElement,"a","class","nick pcol2").toPlainText();
strComm = subElement.toPlainText();
strCommUrl = Find(element,"a","class","nick pcol2").attribute("href");
if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0)
strId = strCommUrl.split("/").at(3).trimmed();
if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0)
strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed();
if(subNick.isEmpty() == false)
{
strNick = strParent;
@@ -771,9 +686,16 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
cout << "ronum = " << nCount << endl;
cout << "parent = " << strParent.toStdString() << endl;
*/
if(strId.length() > 0)
{
query.prepare(QString("insert into " + m_strTable + " (article_id,article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:ID,:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
query.bindValue(":ID", strId.toUtf8());
}
else
query.prepare(QString("insert into " + m_strTable + " (article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,article_parent,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
query.bindValue(":URL",strUrl.toUtf8());
query.bindValue(":PLATFORMID",strUrl.split("/").at(3).toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strComm.toUtf8());
query.bindValue(":DATE",strDate.toUtf8());
@@ -818,13 +740,12 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame)
if (strUrl.split("/").at(2) == "cafe.naver.com")
{
QSqlQuery sql;
/*
QString strQuery = "select URL from ";
strQuery += m_strTableBody;
strQuery += QString(" where URL = '%1'").arg(strUrl);
QString strQuery = "select article_url from ";
strQuery += m_strTable;
strQuery += QString(" where article_url = '%1'").arg(strUrl);
sql.exec(strQuery);
if (sql.size() == 0)
*/
if (sql.size() == 0 || sql.size() == -1)
{
QString strQuery = QString("insert into ");
strQuery += m_strTable;
@@ -833,9 +754,13 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame)
if (sql.exec(strUtf8) == false)
cout << "x " << sql.lastError().text().toStdString();
else
{
cout << "o " << strUrl.toStdString() << endl;
}
}
else
cout << "v " << strUrl.toStdString() << endl;
}
m_bUse = true;
}
}
@@ -962,8 +887,9 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
if (strDate.isEmpty()) continue;
QSqlQuery query;
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8());
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,platform_id,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:PLATFORMID,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8());
query.bindValue(":URL",m_strUrl.toUtf8());
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
query.bindValue(":ID",strID.toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strData.toUtf8());
@@ -986,8 +912,9 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
if (eleParent.toPlainText().isEmpty() == false)
strReParent = eleParent.toPlainText();
QSqlQuery query;
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,platform_id,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:PLATFORMID,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
query.bindValue(":URL",m_strUrl.toUtf8());
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
query.bindValue(":ID",strID.toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strData.toUtf8());
@@ -1033,13 +960,12 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
if (strUrl.split("/").at(2) == "cafe.daum.net")
{
QSqlQuery sql;
/*
QString strQuery = "select URL from ";
strQuery += m_strTableBody;
strQuery += QString(" where URL = '%1'").arg(strUrl);
QString strQuery = "select article_url from ";
strQuery += m_strTable;
strQuery += QString(" where article_url = '%1'").arg(strUrl);
sql.exec(strQuery);
if (sql.size() == 0)
*/
if (sql.size() == 0 || sql.size() == -1)
{
QString strQuery = QString("insert into ");
strQuery += m_strTable;
@@ -1050,6 +976,8 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
else
cout << "o " << strUrl.toStdString() << endl;
}
else
cout << "v " << strUrl.toStdString() << endl;
}
m_bUse = true;
}
@@ -1126,28 +1054,11 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
QWebElement group = Find(Find(frame->documentElement(),"div","class","article_writer"),"a","href","#");
strNick = group.toPlainText().trimmed();
if (strNick.isEmpty() == false)
{
/*
QStringList list = strNick.split("(");
if (list.isEmpty() == false)
strNick = list.at(0);
*/
QWebElement id = Find(frame->documentElement(),"td","class","m-tcol-c b nick");
QStringList list = Find(id,"a","class","m-tcol-c b").attribute("onclick").trimmed().split(",");
QWebElement id = Find(frame->documentElement(),"div","class","article_writer");
QStringList list = FindLeft(id,"a","onclick","showSideView").attribute("onclick").trimmed().split(",");
if (list.size() >= 2)
strID = list.at(1).trimmed().replace("'","");
}
else
{
QStringList list = Find(Find(frame->documentElement(),"td","class","m-tcol-c b nick"),"a","class","m-tcol-c b").attribute("onclick").trimmed().split(",");
if (list.size() >= 4)
{
strID = list.at(1).trimmed().replace("'","");
strNick = list.at(3).trimmed().replace("'","");
}
}
}
QStringList strList = Find(frame->documentElement(),"div","class","article_writer").toPlainText().split("|");
@@ -1210,15 +1121,31 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
QString strData = SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed());
if (strData.isEmpty()) continue;
strData = GetSafeUtf(strData);
QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
QString strID;
QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(",");
if(strListID.length() > 2)
strID = strListID.at(1).trimmed().replace("'","");
QString strNick = Find(element,"a","class","b").toPlainText().trimmed();
strParent = strNick;
QString strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed();
QString strDate;
if(strDatetest.count(".") == 0)
{
strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd");
strDate += (" " + strDatetest);
}
else
{
strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
}
if (strDate.isEmpty()) continue;
else strDate += ":00";
QSqlQuery query;
//query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8());
//query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM)").toUtf8());
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
query.bindValue(":URL",m_strUrl.toUtf8());
query.bindValue(":ID",strID.toUtf8());
@@ -1227,7 +1154,7 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
query.bindValue(":DATE",strDate.toUtf8());
//query.bindValue(":URLREPLY",m_strReper.toUtf8());
query.bindValue(":ROWNUM",nCount++);
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(1).toUtf8());
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
query.bindValue(":HITS",strHits.toUtf8());
query.bindValue(":TITLE",strTitle.toUtf8());
@@ -1243,15 +1170,28 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
if(strReParent.length() == 0)
strReParent = strParent;
QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
QString strID;
QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(",");
if(strListID.length() > 2)
strID = strListID.at(1).trimmed().replace("'","");
QString strNick = Find(element,"a","class","b").toPlainText().trimmed();
QString strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed();
QString strDate;
if(strDatetest.count(".") == 0)
{
strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd");
strDate += (" " + strDatetest);
}
else
{
strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
}
if (strDate.isEmpty()) continue;
else strDate += ":00";
QSqlQuery query;
//query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
//query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM)").toUtf8());
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
query.bindValue(":URL",m_strUrl.toUtf8());
query.bindValue(":ID",strID.toUtf8());
@@ -1261,7 +1201,7 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
query.bindValue(":PARENT",strReParent.toUtf8());
//query.bindValue(":URLREPLY",m_strReper.toUtf8());
query.bindValue(":ROWNUM",nCount++);
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(1).toUtf8());
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
query.bindValue(":HITS",strHits.toUtf8());
query.bindValue(":TITLE",strTitle.toUtf8());
//QWebView::page()->mainFrame()->evaluateJavaScript("");