tree 구조 데이타
git-svn-id: svn://192.168.0.12/source@11 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -19,42 +19,34 @@ SCrawler::~SCrawler()
|
||||
|
||||
void SCrawler::load(QStringList _strlistArgv)
|
||||
{
|
||||
//"naver" << "cafe_list" << QString::number(m_nUrlTable) << m_strListQuery << m_strKeywordID
|
||||
//"naver" << "cafe_data" << QString::number(m_nUrlTable) << m_strListURL.at(m_ncUrl) << m_strListQuery << ""
|
||||
|
||||
|
||||
//m_pro[0].start("CrawlerProcess",QStringList() << m_strListURL.at(m_ncUrl) << "blog_url" << C_TABLE_URL + QString::number(m_nUrlTable));
|
||||
//m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_url" << QString::number(m_nUrlTable) << m_strListURL.at(m_ncUrl) << makeGetCommentQuery(m_strListURL.at(m_ncUrl)) << "" );
|
||||
//m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_url" << QString::number(m_nUrlTable) << makeGetCommentQuery(m_strListURL.at(m_ncUrl++)) << "" );
|
||||
|
||||
m_bUse = false;
|
||||
|
||||
if (_strlistArgv[0] == "naver")
|
||||
{
|
||||
if (_strlistArgv[1] == "cafe_list")
|
||||
{
|
||||
m_strUrl = _strlistArgv[3];
|
||||
m_strUrl = _strlistArgv[2];
|
||||
m_nSelect = E_NAVER_CAFE_LIST;
|
||||
m_strKeywordID = _strlistArgv[4];
|
||||
}
|
||||
|
||||
if (_strlistArgv[1] == "cafe_data")
|
||||
{
|
||||
m_strUrl = _strlistArgv[3];
|
||||
m_strUrl = _strlistArgv[2];
|
||||
m_nSelect = E_NAVER_CAFE_DATA;
|
||||
m_strReper = _strlistArgv[4];
|
||||
}
|
||||
|
||||
if (_strlistArgv[1] == "blog_list")
|
||||
{
|
||||
m_strUrl = _strlistArgv[3];
|
||||
m_strUrl = _strlistArgv[2];
|
||||
m_nSelect = E_NAVER_BLOG_LIST;
|
||||
m_strKeywordID = _strlistArgv[4];
|
||||
}
|
||||
|
||||
if (_strlistArgv[1] == "blog_url")
|
||||
{
|
||||
m_strUrl = _strlistArgv[3];
|
||||
m_strUrl = _strlistArgv[2];
|
||||
m_nSelect = E_NAVER_BLOG_BODY;
|
||||
//m_strReper = _strlistArgv[4];
|
||||
m_bUse = true;
|
||||
@@ -62,24 +54,10 @@ void SCrawler::load(QStringList _strlistArgv)
|
||||
|
||||
if (_strlistArgv[1] == "blog_comm")
|
||||
{
|
||||
m_strUrl = _strlistArgv[3];
|
||||
m_strUrl = _strlistArgv[2];
|
||||
m_nSelect = E_NAVER_BLOG_REPLY;
|
||||
}
|
||||
|
||||
switch(m_nSelect)
|
||||
{
|
||||
case E_NAVER_CAFE_LIST:
|
||||
case E_NAVER_CAFE_DATA:
|
||||
m_strTableBody = "NAVER_CAFE_BODY_" + _strlistArgv[2];
|
||||
m_strTableReply = "NAVER_CAFE_REPLY_" + _strlistArgv[2];
|
||||
break;
|
||||
case E_NAVER_BLOG_BODY:
|
||||
case E_NAVER_BLOG_LIST:
|
||||
case E_NAVER_BLOG_REPLY:
|
||||
m_strTableBody = "NAVER_BLOG_BODY_" + _strlistArgv[2];
|
||||
m_strTableReply = "NAVER_BLOG_REPLY_" + _strlistArgv[2];
|
||||
break;
|
||||
}
|
||||
m_strTable = "data_" + _strlistArgv[3];
|
||||
}
|
||||
|
||||
cout << m_strUrl.toStdString() << endl;
|
||||
@@ -94,17 +72,15 @@ void SCrawler::load(QStringList _strlistArgv)
|
||||
request->setRawHeader("Cache-Control","max-age=0");
|
||||
if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
|
||||
request->setRawHeader("Referer",m_strReper.toLocal8Bit());
|
||||
request->setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
|
||||
m_page->networkAccessManager()->setCookieJar(new QNetworkCookieJar());
|
||||
m_page->mainFrame()->load(*request);
|
||||
//QTime time = QTime::currentTime().addSecs(10);
|
||||
//while(time > QTime::currentTime());
|
||||
request->setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
|
||||
m_page->mainFrame()->load(*request);
|
||||
m_bLast = false;
|
||||
m_bError = false;
|
||||
}
|
||||
|
||||
void SCrawler::UpdateError(QString _strError)
|
||||
{
|
||||
/*
|
||||
QSqlQuery sql;
|
||||
_strError = "'" + _strError + "'";
|
||||
QString strQuery = "update " + m_strTableBody + " set ERROR = " + _strError;
|
||||
@@ -113,6 +89,7 @@ void SCrawler::UpdateError(QString _strError)
|
||||
strQuery += "'";
|
||||
QString strUtf8(strQuery.toUtf8());
|
||||
sql.exec(strUtf8);
|
||||
*/
|
||||
m_bError = true;
|
||||
}
|
||||
|
||||
@@ -186,11 +163,9 @@ void SCrawler::Debug(QString _strFilename,QString _strData)
|
||||
}
|
||||
|
||||
QString SCrawler::SqlString(QString _str)
|
||||
{
|
||||
_str = _str.replace("'","");
|
||||
_str = _str.replace("\"","");
|
||||
_str = _str.replace("\n","");
|
||||
_str = _str.replace(",","");
|
||||
{
|
||||
_str = _str.replace("'","\\'");
|
||||
_str = _str.replace("\"","\\\"");
|
||||
return _str;
|
||||
}
|
||||
|
||||
@@ -234,7 +209,7 @@ void SCrawler::saveFrameList(QWebFrame *frame)
|
||||
if (strList.at(0).compare("blog.naver.com") != 0 ) { cout << " not" << endl; continue; };
|
||||
|
||||
QString strQuery = "select URL from ";
|
||||
strQuery += m_strTableBody;
|
||||
strQuery += m_strTable;
|
||||
strQuery += QString(" where URL = '%1'").arg(strUrl);
|
||||
sql.exec(strQuery);
|
||||
|
||||
@@ -247,8 +222,8 @@ void SCrawler::saveFrameList(QWebFrame *frame)
|
||||
str = str.trimmed();
|
||||
|
||||
QString strQuery = QString("insert into ");
|
||||
strQuery += m_strTableBody;
|
||||
strQuery += QString(" set Url='%1',PlatformID='%2',PlatformTitle='%3',keyword_id='%4'").arg("http://"+strUrl).arg(strUrl.split("/").at(1)).arg(str).arg(m_strKeywordID);
|
||||
strQuery += m_strTable;
|
||||
strQuery += QString(" set Url='%1',platform_id='%2',platform_title='%3',keyword_id='%4'").arg("http://"+strUrl).arg(strUrl.split("/").at(1)).arg(str).arg(m_strKeywordID);
|
||||
QString strUtf8(strQuery.toUtf8());
|
||||
if (sql.exec(strUtf8) == false)
|
||||
cout << "error : " << sql.lastError().text().toStdString();
|
||||
@@ -294,7 +269,7 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
||||
QString str = profile.toPlainText().split("\n").at(0);
|
||||
if (str.isEmpty() == false)
|
||||
{
|
||||
QString strQuery = "update " + m_strTableBody + " set NICKNAME = '";
|
||||
QString strQuery = "update " + m_strTable + " set NICKNAME = '";
|
||||
strQuery += str;
|
||||
strQuery += "'";
|
||||
strQuery += " where URL='";
|
||||
@@ -314,7 +289,6 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
||||
{
|
||||
QString str[E_DATA_MAX];
|
||||
QString strHead[E_DATA_MAX] = {"NickName","ArticleID","ArticleTitle","Date","Data","PlatformTitle"};
|
||||
/////////////////////////////////////////////
|
||||
QWebElement proTitle = Find(frame->documentElement(),"td","id","blogTitleText");
|
||||
str[E_DATA_PLATFORM_TITLE] = proTitle.toPlainText().trimmed();
|
||||
|
||||
@@ -397,15 +371,12 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
||||
}
|
||||
}
|
||||
|
||||
QString strQuery = "update " + m_strTableBody + " set ";
|
||||
QString strQuery = "update " + m_strTable + " set ";
|
||||
for(int i = 0; i < E_DATA_MAX ; i++)
|
||||
{
|
||||
str[i].replace("'","\\'");
|
||||
str[i].replace("\"","\\\"");
|
||||
str[i] = str[i].trimmed();
|
||||
strQuery += strHead[i];
|
||||
strQuery += "='";
|
||||
strQuery += str[i];
|
||||
strQuery += GetSafeUtf(SqlString(str[i].trimmed()));
|
||||
strQuery += "'";
|
||||
if( i != (E_DATA_MAX - 1) )
|
||||
strQuery += ",";
|
||||
@@ -453,7 +424,7 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
|
||||
strComm = strComm.trimmed();
|
||||
|
||||
QSqlQuery query;
|
||||
query.prepare(QString("insert into " + m_strTableReply + " (Url,Nickname,Data,Date,Parent,UrlReply,RowNum) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
|
||||
query.prepare(QString("insert into " + m_strTable + " (Url,Nickname,Data,Date,Parent,UrlReply,RowNum) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
|
||||
query.bindValue(":URL", strUrl.toUtf8());
|
||||
query.bindValue(":NICK",strNick.toUtf8());
|
||||
query.bindValue(":DATA",strComm.toUtf8());
|
||||
@@ -486,7 +457,7 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
|
||||
strComm.replace("\"","\\\"");
|
||||
strComm = strComm.trimmed();
|
||||
QSqlQuery query;
|
||||
query.prepare(QString("insert into " + m_strTableReply + " (Url,Nickname,Data,Date,Parent,UrlReply,RowNum) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
|
||||
query.prepare(QString("insert into " + m_strTable + " (Url,Nickname,Data,Date,Parent,UrlReply,RowNum) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
|
||||
query.bindValue(":URL",strUrl.toUtf8());
|
||||
query.bindValue(":NICK",strNick.toUtf8());
|
||||
query.bindValue(":DATA",strComm.toUtf8());
|
||||
@@ -516,32 +487,32 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame)
|
||||
{
|
||||
if (eleSubUrl.attribute("class") == "url")
|
||||
strUrl = eleSubUrl.attribute("href");
|
||||
|
||||
if (eleSubUrl.attribute("class") == "sh_cafe_title")
|
||||
strTitle = eleSubUrl.toPlainText();
|
||||
strTitle = GetSafeUtf(SqlString(eleSubUrl.toPlainText().trimmed()));
|
||||
}
|
||||
|
||||
if (strUrl.split("/").at(2) == "cafe.naver.com")
|
||||
{
|
||||
cout << "url : " << strUrl.toStdString();
|
||||
{
|
||||
QSqlQuery sql;
|
||||
/*
|
||||
QString strQuery = "select URL from ";
|
||||
strQuery += m_strTableBody;
|
||||
strQuery += QString(" where URL = '%1'").arg(strUrl);
|
||||
sql.exec(strQuery);
|
||||
sql.exec(strQuery);
|
||||
if (sql.size() == 0)
|
||||
*/
|
||||
{
|
||||
QString strQuery = QString("insert into ");
|
||||
strQuery += m_strTableBody;
|
||||
strQuery += QString(" set Url='%1',PlatformID='%2',ArticleTitle='%3',keyword_id='%4'").arg(strUrl).arg(strUrl.split("/").at(3)).arg(strTitle).arg(m_strKeywordID);
|
||||
strQuery += m_strTable;
|
||||
strQuery += QString(" set platform_name='naver',platform_form='cafe',article_form='body',article_url='%1',platform_id='%2',article_title='%3',keyword_id='%4'").arg(strUrl).arg(strUrl.split("/").at(3)).arg(strTitle).arg(m_strKeywordID);
|
||||
QString strUtf8(strQuery.toUtf8());
|
||||
if (sql.exec(strUtf8) == false)
|
||||
cout << "error : " << sql.lastError().text().toStdString();
|
||||
cout << "x " << sql.lastError().text().toStdString();
|
||||
else
|
||||
cout << " ok" << endl;
|
||||
cout << "o " << strUrl.toStdString() << endl;
|
||||
}
|
||||
else
|
||||
cout << " overlap" << endl;
|
||||
}
|
||||
}
|
||||
m_bUse = true;
|
||||
}
|
||||
}
|
||||
@@ -566,10 +537,10 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
|
||||
if (other.toPlainText().isEmpty() == false)
|
||||
{
|
||||
QString strQuery = "update ";
|
||||
strQuery += m_strTableBody;
|
||||
strQuery += m_strTable;
|
||||
strQuery += " set ";
|
||||
strQuery += "PlatformTitle = '" + SqlString(GetSafeUtf(other.toPlainText())) + "'";
|
||||
strQuery += "where URL='";
|
||||
strQuery += "platform_title = '" + SqlString(GetSafeUtf(other.toPlainText())) + "'";
|
||||
strQuery += "where article_url='";
|
||||
strQuery += m_strUrl;
|
||||
strQuery += "'";
|
||||
QString strUtf8(strQuery.toUtf8());
|
||||
@@ -577,18 +548,14 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
|
||||
if (sql.exec(strUtf8) == false)
|
||||
cout << "error : " << sql.lastError().text().toStdString();
|
||||
}
|
||||
|
||||
if (frame->frameName() == "cafe_main")
|
||||
{
|
||||
// Main
|
||||
{
|
||||
{
|
||||
QString strData,strDate,strNick,strID,strViews;
|
||||
QString strData,strDate,strNick,strID,strHits;
|
||||
{
|
||||
QWebElement group = Find(frame->documentElement(),"div","class","tbody m-tcol-c");
|
||||
strData = SqlString(group.toPlainText().trimmed());
|
||||
|
||||
if (strData.size() >= 18430)
|
||||
strData = strData.left(18430);
|
||||
|
||||
strData = GetSafeUtf(strData);
|
||||
}
|
||||
{
|
||||
@@ -611,26 +578,40 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
|
||||
if (strNick.isEmpty() == false)
|
||||
{
|
||||
QStringList list = strNick.split("(");
|
||||
strNick = list.at(0);
|
||||
strID = Find(Find(frame->documentElement(),"td","class","m-tcol-c b nick"),"a","class","m-tcol-c b").attribute("onclick").trimmed().split(",").at(1).trimmed().replace("'","");
|
||||
if (list.isEmpty() == false)
|
||||
strNick = list.at(0);
|
||||
QWebElement id = Find(frame->documentElement(),"td","class","m-tcol-c b nick");
|
||||
list = Find(id,"a","class","m-tcol-c b").attribute("onclick").trimmed().split(",");
|
||||
if (list.size() >= 2)
|
||||
strID = list.at(1).trimmed().replace("'","");
|
||||
}
|
||||
else
|
||||
{
|
||||
QStringList list = Find(Find(frame->documentElement(),"td","class","m-tcol-c b nick"),"a","class","m-tcol-c b").attribute("onclick").trimmed().split(",");
|
||||
if (list.size() >= 4)
|
||||
{
|
||||
strID = list.at(1).trimmed().replace("'","");
|
||||
strNick = list.at(3).trimmed().replace("'","");
|
||||
}
|
||||
}
|
||||
//qDebug() << "id : " << strID;
|
||||
|
||||
}
|
||||
strHits = Find(frame->documentElement(),"span","class","b m-tcol-c reply _rosReadcount").toPlainText();
|
||||
if (strHits.isEmpty())
|
||||
{
|
||||
strHits = Find(frame->documentElement(),"span","class","kin_count m-tcol-c _rosReadcount").toPlainText();
|
||||
}
|
||||
|
||||
strViews = Find(frame->documentElement(),"span","class","b m-tcol-c reply _rosReadcount").toPlainText();
|
||||
//qDebug() << "Count : " << strViews;
|
||||
|
||||
//if (strDate.isEmpty() == false)
|
||||
{
|
||||
QSqlQuery sql;
|
||||
QString strQuery = "update ";
|
||||
strQuery += m_strTableBody;
|
||||
strQuery += m_strTable;
|
||||
strQuery += " set ";
|
||||
strQuery += "Data = '" + strData + "',";
|
||||
strQuery += "Date = '" + strDate + "',";
|
||||
strQuery += "Nickname = '" + strNick + "',";
|
||||
strQuery += "ArticleID = '" + strID + "'";
|
||||
strQuery += "where URL='";
|
||||
strQuery += "article_data = '" + strData + "',";
|
||||
strQuery += "article_date = '" + strDate + "',";
|
||||
strQuery += "article_nickname = '" + strNick + "',";
|
||||
strQuery += "article_id = '" + strID + "',";
|
||||
strQuery += "article_hit = '" + strHits + "'";
|
||||
strQuery += "where article_url='";
|
||||
strQuery += m_strUrl;
|
||||
strQuery += "'";
|
||||
QString strUtf8(strQuery.toUtf8());
|
||||
@@ -649,17 +630,18 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
|
||||
if (element.attribute("class").isEmpty())
|
||||
{
|
||||
QString strData = SqlString(Find(element,"span","class","comm_body").toPlainText().trimmed());
|
||||
if (strData.isEmpty()) continue;
|
||||
strData = GetSafeUtf(strData);
|
||||
|
||||
QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
|
||||
QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed();
|
||||
strParent = strNick;
|
||||
QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed();
|
||||
if (strDate.isEmpty()) continue;
|
||||
QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
|
||||
//qDebug() << strID;
|
||||
QSqlQuery query;
|
||||
query.prepare(QString("insert into " + m_strTableReply + " (Url,NickName,Data,Date,UrlReply,RowNum) VALUES (:URL,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8());
|
||||
|
||||
QSqlQuery query;
|
||||
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8());
|
||||
query.bindValue(":URL",m_strUrl.toUtf8());
|
||||
query.bindValue(":ID",strID.toUtf8());
|
||||
query.bindValue(":NICK",strNick.toUtf8());
|
||||
query.bindValue(":DATA",strData.toUtf8());
|
||||
query.bindValue(":DATE",strDate.toUtf8());
|
||||
@@ -673,19 +655,17 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
|
||||
{
|
||||
QString strData = Find(element,"span","class","comm_body").toPlainText().trimmed();
|
||||
if (strData.isEmpty()) continue;
|
||||
|
||||
QString strReParent = strParent;
|
||||
QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
|
||||
QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed();
|
||||
QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed();
|
||||
QWebElement eleParent = Find(element,"span","class","re-p-nick");
|
||||
if (eleParent.toPlainText().isEmpty() == false)
|
||||
strReParent = eleParent.toPlainText();
|
||||
QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
|
||||
//qDebug() << strID;
|
||||
|
||||
QSqlQuery query;
|
||||
query.prepare(QString("insert into " + m_strTableReply + " (Url,NickName,Data,Date,Parent,UrlReply,RowNum) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
|
||||
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
|
||||
query.bindValue(":URL",m_strUrl.toUtf8());
|
||||
query.bindValue(":ID",strID.toUtf8());
|
||||
query.bindValue(":NICK",strNick.toUtf8());
|
||||
query.bindValue(":DATA",strData.toUtf8());
|
||||
query.bindValue(":DATE",strDate.toUtf8());
|
||||
@@ -695,12 +675,11 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
|
||||
|
||||
if (query.exec()==false)
|
||||
cout << "error : " << query.lastError().text().toStdString();
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
m_bUse = true;
|
||||
}
|
||||
}
|
||||
foreach(QWebFrame *childFrame, frame->childFrames())
|
||||
saveFrameCafeUrl(childFrame);
|
||||
}
|
||||
@@ -718,4 +697,3 @@ QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElem
|
||||
QWebElement element;
|
||||
return element;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user