git-svn-id: svn://192.168.0.12/source@29 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -13,7 +13,7 @@ TEMPLATE = app
|
|||||||
|
|
||||||
SOURCES += main.cpp \
|
SOURCES += main.cpp \
|
||||||
scrawler.cpp \
|
scrawler.cpp \
|
||||||
scrawler_backup.cpp
|
|
||||||
|
|
||||||
HEADERS += \
|
HEADERS += \
|
||||||
scrawler.h
|
scrawler.h
|
||||||
|
|||||||
@@ -76,6 +76,33 @@ void SCrawler::load(QStringList _strlistArgv)
|
|||||||
m_nSelect = E_NAVER_BLOG_LIST;
|
m_nSelect = E_NAVER_BLOG_LIST;
|
||||||
m_strKeywordID = _strlistArgv[4];
|
m_strKeywordID = _strlistArgv[4];
|
||||||
//cout << "ok";
|
//cout << "ok";
|
||||||
|
|
||||||
|
QFile file("proxy.txt");
|
||||||
|
if (file.open(QIODevice::ReadOnly | QIODevice::Text))
|
||||||
|
{
|
||||||
|
QVector <QStringList> vecProxy;
|
||||||
|
while (!file.atEnd())
|
||||||
|
{
|
||||||
|
QString str = QString(file.readLine());
|
||||||
|
if (str.isEmpty()) continue;
|
||||||
|
vecProxy.push_back(str.split(","));
|
||||||
|
}
|
||||||
|
if (vecProxy.size() > 0)
|
||||||
|
{
|
||||||
|
QStringList strList = vecProxy.at(rand()%vecProxy.size());
|
||||||
|
switch(strList.size())
|
||||||
|
{
|
||||||
|
case 1:
|
||||||
|
cout << "p : " << strList.at(0).toStdString() << endl;
|
||||||
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
cout << "p : " << strList.at(0).toStdString() << endl;
|
||||||
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (_strlistArgv[1] == "blog_url")
|
if (_strlistArgv[1] == "blog_url")
|
||||||
@@ -241,17 +268,16 @@ void SCrawler::saveFrameList(QWebFrame *frame)
|
|||||||
m_bUse = true;
|
m_bUse = true;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
cout << "url : " << strUrl.toStdString();
|
|
||||||
|
|
||||||
QStringList strList = strUrl.split('/');
|
QStringList strList = strUrl.split('/');
|
||||||
if (strList.at(0).compare("blog.naver.com") != 0 ) { cout << " not" << endl; continue; };
|
if (strList.at(0).compare("blog.naver.com") != 0 ) { cout << "x http://" << strUrl.toStdString() <<endl; continue; };
|
||||||
|
|
||||||
QString strQuery = "select article_url from ";
|
QString strQuery = "select article_url from ";
|
||||||
strQuery += m_strTable;
|
strQuery += m_strTable;
|
||||||
strQuery += QString(" where article_url = '%1'").arg(strUrl);
|
strQuery += QString(" where article_url = 'http://%1'").arg(strUrl);
|
||||||
sql.exec(strQuery);
|
sql.exec(strQuery);
|
||||||
|
|
||||||
if (sql.size() == -1)
|
if (sql.size() == 0 || sql.size() == -1)
|
||||||
{
|
{
|
||||||
QString str = Find(sub,"a","class","txt84").toPlainText();
|
QString str = Find(sub,"a","class","txt84").toPlainText();
|
||||||
str = GetSafeUtf(str);
|
str = GetSafeUtf(str);
|
||||||
@@ -261,15 +287,16 @@ void SCrawler::saveFrameList(QWebFrame *frame)
|
|||||||
|
|
||||||
QString strQuery = QString("insert into ");
|
QString strQuery = QString("insert into ");
|
||||||
strQuery += m_strTable;
|
strQuery += m_strTable;
|
||||||
strQuery += QString(" set article_url='%1',platform_id='%2',platform_title='%3',keyword_id='%4'").arg("http://"+strUrl).arg(strUrl.split("/").at(1)).arg(str).arg(m_strKeywordID);
|
strQuery += QString(" set platform_name='naver',platform_form='blog',article_form='body',article_url='%1',platform_id='%2',platform_title='%3',keyword_id='%4'").arg("http://"+strUrl).arg(strUrl.split("/").at(1)).arg(str).arg(m_strKeywordID);
|
||||||
QString strUtf8(strQuery.toUtf8());
|
QString strUtf8(strQuery.toUtf8());
|
||||||
if (sql.exec(strUtf8) == false)
|
if (sql.exec(strUtf8) == false)
|
||||||
cout << "error : " << sql.lastError().text().toStdString();
|
cout << "error : " << sql.lastError().text().toStdString();
|
||||||
else
|
else
|
||||||
cout << " ok" << endl;
|
cout << "o ";
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
cout << " overlap" << endl;
|
cout << "v ";
|
||||||
|
cout << "http://" << strUrl.toStdString() << endl;
|
||||||
m_bUse = true;
|
m_bUse = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -395,7 +422,6 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
|||||||
if ( str[E_DATA_DATE].isEmpty() == false)
|
if ( str[E_DATA_DATE].isEmpty() == false)
|
||||||
{
|
{
|
||||||
str[E_DATA_DATE] += ":00";
|
str[E_DATA_DATE] += ":00";
|
||||||
cout << "str[E_DATA_DATE] = " << str[E_DATA_DATE].toStdString() << endl;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@@ -461,7 +487,7 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
|
|||||||
strNick = strParent = Find(element,"a","class","nick pcol2").toPlainText();
|
strNick = strParent = Find(element,"a","class","nick pcol2").toPlainText();
|
||||||
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
|
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
|
||||||
strComm = Find(element,"dd","class","comm pcol2").toPlainText();
|
strComm = Find(element,"dd","class","comm pcol2").toPlainText();
|
||||||
//strComm = GetSafeUtf(strComm);
|
strComm = GetSafeUtf(strComm);
|
||||||
if (strComm.isEmpty()== false)
|
if (strComm.isEmpty()== false)
|
||||||
{
|
{
|
||||||
strComm.replace("'","\\'");
|
strComm.replace("'","\\'");
|
||||||
@@ -469,13 +495,15 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
|
|||||||
strComm = strComm.trimmed();
|
strComm = strComm.trimmed();
|
||||||
//cout << strComm.toStdString() << endl;
|
//cout << strComm.toStdString() << endl;
|
||||||
QSqlQuery query;
|
QSqlQuery query;
|
||||||
query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
|
query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,article_parent,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
|
||||||
|
/*
|
||||||
cout << "m_strTable = " << m_strTable.toStdString() << endl;
|
cout << "m_strTable = " << m_strTable.toStdString() << endl;
|
||||||
cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl;
|
cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl;
|
||||||
cout << "data = " << strComm.toStdString() << endl;
|
cout << "data = " << strComm.toStdString() << endl;
|
||||||
cout << "date = " << strDate.toStdString() << endl;
|
cout << "date = " << strDate.toStdString() << endl;
|
||||||
cout << "urlReply = " << m_strUrl.toStdString() << endl;
|
cout << "urlReply = " << m_strUrl.toStdString() << endl;
|
||||||
cout << "ronum = " << nCount << endl;
|
cout << "ronum = " << nCount << endl;
|
||||||
|
*/
|
||||||
query.bindValue(":URL", strUrl.toUtf8());
|
query.bindValue(":URL", strUrl.toUtf8());
|
||||||
query.bindValue(":NICK",strNick.toUtf8());
|
query.bindValue(":NICK",strNick.toUtf8());
|
||||||
query.bindValue(":DATA",strComm.toUtf8());
|
query.bindValue(":DATA",strComm.toUtf8());
|
||||||
@@ -503,19 +531,22 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
|
|||||||
|
|
||||||
if (strComm.isEmpty() == false)
|
if (strComm.isEmpty() == false)
|
||||||
{
|
{
|
||||||
// strComm = GetSafeUtf(strComm);
|
strComm = GetSafeUtf(strComm);
|
||||||
strComm.replace("'","\\'");
|
strComm.replace("'","\\'");
|
||||||
strComm.replace("\"","\\\"");
|
strComm.replace("\"","\\\"");
|
||||||
strComm = strComm.trimmed();
|
strComm = strComm.trimmed();
|
||||||
QSqlQuery query;
|
QSqlQuery query;
|
||||||
cout << "m_strTable = " << m_strTable.toStdString() << endl;
|
/*
|
||||||
|
cout << "m_strTable = " << m_strTable.toStdString() << endl;
|
||||||
cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl;
|
cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl;
|
||||||
cout << "data = " << strComm.toStdString() << endl;
|
cout << "data = " << strComm.toStdString() << endl;
|
||||||
cout << "date = " << strDate.toStdString() << endl;
|
cout << "date = " << strDate.toStdString() << endl;
|
||||||
cout << "urlReply = " << m_strUrl.toStdString() << endl;
|
cout << "urlReply = " << m_strUrl.toStdString() << endl;
|
||||||
cout << "ronum = " << nCount << endl;
|
cout << "ronum = " << nCount << endl;
|
||||||
cout << "parent = " << strParent.toStdString() << endl;
|
cout << "parent = " << strParent.toStdString() << endl;
|
||||||
query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
|
*/
|
||||||
|
|
||||||
|
query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,article_parent,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
|
||||||
query.bindValue(":URL",strUrl.toUtf8());
|
query.bindValue(":URL",strUrl.toUtf8());
|
||||||
query.bindValue(":NICK",strNick.toUtf8());
|
query.bindValue(":NICK",strNick.toUtf8());
|
||||||
query.bindValue(":DATA",strComm.toUtf8());
|
query.bindValue(":DATA",strComm.toUtf8());
|
||||||
|
|||||||
@@ -58,6 +58,13 @@ void SNaverBlogManage::processFinished(QProcess *pPro,QString _strOut)
|
|||||||
if (_strOut.right(4) == "last" || m_ncList >= 991)
|
if (_strOut.right(4) == "last" || m_ncList >= 991)
|
||||||
m_bLast = true;
|
m_bLast = true;
|
||||||
m_strListURL.clear();
|
m_strListURL.clear();
|
||||||
|
foreach(QString str,_strOut.split("\n"))
|
||||||
|
{
|
||||||
|
if (str.isEmpty()) continue;
|
||||||
|
if (str.at(0) == QChar('o'))
|
||||||
|
m_strListURL.push_back(str.right(str.length()-2).trimmed());
|
||||||
|
}
|
||||||
|
/*
|
||||||
QSqlQuery query;
|
QSqlQuery query;
|
||||||
if(query.exec("SELECT ARTICLE_URL FROM " + C_TABLE_URL + QString::number(m_nUrlTable) + " where ERROR is null"))
|
if(query.exec("SELECT ARTICLE_URL FROM " + C_TABLE_URL + QString::number(m_nUrlTable) + " where ERROR is null"))
|
||||||
{
|
{
|
||||||
@@ -66,6 +73,7 @@ void SNaverBlogManage::processFinished(QProcess *pPro,QString _strOut)
|
|||||||
|
|
||||||
while (query.next())
|
while (query.next())
|
||||||
m_strListURL.append(query.value(0).toString());
|
m_strListURL.append(query.value(0).toString());
|
||||||
|
*/
|
||||||
m_ncUrl = 0;
|
m_ncUrl = 0;
|
||||||
if (m_strListURL.size() == 0)
|
if (m_strListURL.size() == 0)
|
||||||
{
|
{
|
||||||
@@ -108,7 +116,8 @@ bool SNaverBlogManage::Update()
|
|||||||
m_pMain->InsertLog(m_nID,"Start : " + QString::number(m_ncList) + " Date : " + m_date.toString("yyyy-MM-dd"));
|
m_pMain->InsertLog(m_nID,"Start : " + QString::number(m_ncList) + " Date : " + m_date.toString("yyyy-MM-dd"));
|
||||||
//for(int i = 0; i < C_PROCESS_MAX ; i++)
|
//for(int i = 0; i < C_PROCESS_MAX ; i++)
|
||||||
{
|
{
|
||||||
m_pro[0].start("CrawlerProcess",QStringList()<< "naver" << "blog_list" << QString::number(m_nUrlTable) << m_strQuery << m_strKeywordID);
|
m_pro[0].start("CrawlerProcess",QStringList()<< "naver" << "blog_list" << m_strQuery << m_strGroupID << m_strKeywordID);
|
||||||
|
// m_pro[0].SetState(SProcess::STATE_RUNNING);
|
||||||
m_ncList+=10;
|
m_ncList+=10;
|
||||||
}
|
}
|
||||||
m_nMode = E_PROCESS_LIST_FINISH_WAIT;
|
m_nMode = E_PROCESS_LIST_FINISH_WAIT;
|
||||||
@@ -119,13 +128,11 @@ bool SNaverBlogManage::Update()
|
|||||||
if (UseProcess() == false)
|
if (UseProcess() == false)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
m_pMain->InsertLog(m_nID,"(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
|
|
||||||
for(int i = 0; i < C_PROCESS_MAX ; i++)
|
for(int i = 0; i < C_PROCESS_MAX ; i++)
|
||||||
{
|
{
|
||||||
m_pro[0].start("CrawlerProcess",QStringList() << m_strListURL.at(m_ncUrl) << "url");
|
m_pro[0].start("CrawlerProcess",QStringList() << m_strListURL.at(m_ncUrl) << "url");
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
/*
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
foreach(QString strUrl,m_strListURL)
|
foreach(QString strUrl,m_strListURL)
|
||||||
//for(int i = 0; i < C_PROCESS_MAX ; i++)
|
//for(int i = 0; i < C_PROCESS_MAX ; i++)
|
||||||
@@ -135,7 +142,9 @@ bool SNaverBlogManage::Update()
|
|||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
//m_pro[0].start("CrawlerProcess",QStringList() << m_strListURL.at(m_ncUrl) << "blog_url" << C_TABLE_URL + QString::number(m_nUrlTable));
|
//m_pro[0].start("CrawlerProcess",QStringList() << m_strListURL.at(m_ncUrl) << "blog_url" << C_TABLE_URL + QString::number(m_nUrlTable));
|
||||||
m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_url" << QString::number(m_nUrlTable) << m_strListURL.at(m_ncUrl) << makeGetCommentQuery(m_strListURL.at(m_ncUrl)) << "" );
|
m_pMain->InsertLog(m_nID,"(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
|
||||||
|
m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_url" << m_strListURL.at(m_ncUrl) << m_strGroupID << "" );
|
||||||
|
// m_pro[0].SetState(SProcess::STATE_RUNNING);
|
||||||
m_nMode = E_PROCESS_URL_FINISH_WAIT;
|
m_nMode = E_PROCESS_URL_FINISH_WAIT;
|
||||||
m_nWait = 0;
|
m_nWait = 0;
|
||||||
}
|
}
|
||||||
@@ -150,8 +159,10 @@ bool SNaverBlogManage::Update()
|
|||||||
//if (i >= C_PROCESS_MAX) break;
|
//if (i >= C_PROCESS_MAX) break;
|
||||||
//m_ncUrl++;
|
//m_ncUrl++;
|
||||||
}
|
}
|
||||||
m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_comm" << QString::number(m_nUrlTable) << makeGetCommentQuery(m_strListURL.at(m_ncUrl++)) << "" );
|
//m_pMain->InsertLog(m_nID,"(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
|
||||||
|
m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_comm" << makeGetCommentQuery(m_strListURL.at(m_ncUrl++)) << m_strGroupID << "" );
|
||||||
//m_pro[0].start("CrawlerProcess",QStringList() << makeGetCommentQuery(m_strListURL.at(m_ncUrl++))<< "blog_comm" << C_TABLE_COM + QString::number(m_nUrlTable));
|
//m_pro[0].start("CrawlerProcess",QStringList() << makeGetCommentQuery(m_strListURL.at(m_ncUrl++))<< "blog_comm" << C_TABLE_COM + QString::number(m_nUrlTable));
|
||||||
|
// m_pro[0].SetState(SProcess::STATE_RUNNING);
|
||||||
m_nMode = E_PROCESS_COMMENT_FINISH_WAIT;
|
m_nMode = E_PROCESS_COMMENT_FINISH_WAIT;
|
||||||
m_nWait = 0;
|
m_nWait = 0;
|
||||||
}
|
}
|
||||||
@@ -162,12 +173,15 @@ bool SNaverBlogManage::Update()
|
|||||||
m_nWait++;
|
m_nWait++;
|
||||||
if (m_nWait > (1000000/m_nTime))
|
if (m_nWait > (1000000/m_nTime))
|
||||||
{
|
{
|
||||||
for(int i = 0; i < C_PROCESS_MAX ; i++)
|
//for(int i = 0; i < C_PROCESS_MAX ; i++)
|
||||||
{
|
{
|
||||||
m_pro[i].kill();
|
{
|
||||||
m_pMain->InsertLog(m_nID,"Kill Process.");
|
m_pro[0].kill();
|
||||||
|
m_pMain->InsertLog(m_nID,"Kill Process.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (m_nMode == E_PROCESS_LIST_FINISH_WAIT) return m_bFinalLast;
|
if (m_nMode == E_PROCESS_LIST_FINISH_WAIT) return m_bFinalLast;
|
||||||
|
/*
|
||||||
QString strQuery = "update blog set ";
|
QString strQuery = "update blog set ";
|
||||||
strQuery += "Error ";
|
strQuery += "Error ";
|
||||||
strQuery += "='";
|
strQuery += "='";
|
||||||
@@ -181,6 +195,7 @@ bool SNaverBlogManage::Update()
|
|||||||
strQuery += "'";
|
strQuery += "'";
|
||||||
QSqlQuery sql;
|
QSqlQuery sql;
|
||||||
sql.exec(strQuery);
|
sql.exec(strQuery);
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user