diff --git a/CrawlerProcess/CrawlerProcess.pro b/CrawlerProcess/CrawlerProcess.pro index b4b810c..4d469ca 100644 --- a/CrawlerProcess/CrawlerProcess.pro +++ b/CrawlerProcess/CrawlerProcess.pro @@ -12,7 +12,8 @@ CONFIG -= app_bundle TEMPLATE = app SOURCES += main.cpp \ - scrawler.cpp + scrawler.cpp \ + scrawler_backup.cpp HEADERS += \ scrawler.h diff --git a/CrawlerProcess/scrawler.cpp b/CrawlerProcess/scrawler.cpp index f1b473e..0fd56e3 100644 --- a/CrawlerProcess/scrawler.cpp +++ b/CrawlerProcess/scrawler.cpp @@ -75,6 +75,7 @@ void SCrawler::load(QStringList _strlistArgv) m_strUrl = _strlistArgv[2]; m_nSelect = E_NAVER_BLOG_LIST; m_strKeywordID = _strlistArgv[4]; + //cout << "ok"; } if (_strlistArgv[1] == "blog_url") @@ -245,12 +246,12 @@ void SCrawler::saveFrameList(QWebFrame *frame) QStringList strList = strUrl.split('/'); if (strList.at(0).compare("blog.naver.com") != 0 ) { cout << " not" << endl; continue; }; - QString strQuery = "select URL from "; + QString strQuery = "select article_url from "; strQuery += m_strTable; - strQuery += QString(" where URL = '%1'").arg(strUrl); + strQuery += QString(" where article_url = '%1'").arg(strUrl); sql.exec(strQuery); - if (sql.size() == 0) + if (sql.size() == -1) { QString str = Find(sub,"a","class","txt84").toPlainText(); str = GetSafeUtf(str); @@ -260,7 +261,7 @@ void SCrawler::saveFrameList(QWebFrame *frame) QString strQuery = QString("insert into "); strQuery += m_strTable; - strQuery += QString(" set Url='%1',platform_id='%2',platform_title='%3',keyword_id='%4'").arg("http://"+strUrl).arg(strUrl.split("/").at(1)).arg(str).arg(m_strKeywordID); + strQuery += QString(" set article_url='%1',platform_id='%2',platform_title='%3',keyword_id='%4'").arg("http://"+strUrl).arg(strUrl.split("/").at(1)).arg(str).arg(m_strKeywordID); QString strUtf8(strQuery.toUtf8()); if (sql.exec(strUtf8) == false) cout << "error : " << sql.lastError().text().toStdString(); @@ -308,10 +309,10 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) QString str = profile.toPlainText().split("\n").at(0); if (str.isEmpty() == false) { - QString strQuery = "update " + m_strTable + " set NICKNAME = '"; + QString strQuery = "update " + m_strTable + " set article_nickname = '"; strQuery += str; strQuery += "'"; - strQuery += " where URL='"; + strQuery += " where article_url='"; strQuery += m_strUrl; strQuery += "'"; QString strUtf8(strQuery.toUtf8()); @@ -327,9 +328,10 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) if (frame->frameName().compare(QString("mainFrame")) == 0) { QString str[E_DATA_MAX]; - QString strHead[E_DATA_MAX] = {"NickName","ArticleID","ArticleTitle","Date","Data","PlatformTitle"}; + //QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"}; + QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data"}; QWebElement proTitle = Find(frame->documentElement(),"td","id","blogTitleText"); - str[E_DATA_PLATFORM_TITLE] = proTitle.toPlainText().trimmed(); + // str[E_DATA_PLATFORM_TITLE] = proTitle.toPlainText().trimmed(); QWebElement profile = Find(frame->documentElement(),"div","id","blog-profile"); { @@ -389,8 +391,12 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) { QWebElement date = Find(post_top,"p","class","date fil5 pcol2 _postAddDate"); - if (date.toPlainText().isEmpty() == false) - str[E_DATA_DATE] = date.toPlainText(); + str[E_DATA_DATE] = date.toPlainText().trimmed().replace("/","-"); + if ( str[E_DATA_DATE].isEmpty() == false) + { + str[E_DATA_DATE] += ":00"; + cout << "str[E_DATA_DATE] = " << str[E_DATA_DATE].toStdString() << endl; + } else { UpdateError("Error code 4"); @@ -403,28 +409,28 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) if (body.toPlainText().isEmpty()==false) { str[E_DATA_DATA] = body.toPlainText(); - if (str[E_DATA_DATA].size() >= 18430) - str[E_DATA_DATA] = str[E_DATA_DATA].left(18430); str[E_DATA_DATA] = GetSafeUtf(str[E_DATA_DATA]); } } } QString strQuery = "update " + m_strTable + " set "; - for(int i = 0; i < E_DATA_MAX ; i++) + for(int i = 0; i < E_DATA_MAX - 1 ; i++) { strQuery += strHead[i]; strQuery += "='"; - strQuery += GetSafeUtf(SqlString(str[i].trimmed())); + //strQuery += GetSafeUtf(SqlString(str[i].trimmed())); + strQuery += str[i].trimmed(); strQuery += "'"; - if( i != (E_DATA_MAX - 1) ) + if( i != (E_DATA_MAX - 2) ) strQuery += ","; } - strQuery += " where URL='"; + strQuery += " where article_url='"; strQuery += m_strUrl; strQuery += "'"; QString strUtf8(strQuery.toUtf8()); //qDebug() << strQuery; + //cout << "Query : " << strQuery.toStdString() << endl; if (sql.exec(strUtf8)==false) { cout << "error : " << sql.lastError().text().toStdString(); @@ -455,25 +461,31 @@ void SCrawler::saveFrameComment(QWebFrame *frame) strNick = strParent = Find(element,"a","class","nick pcol2").toPlainText(); strDate = Find(element,"span","class","date fil5 pcol2").toPlainText(); strComm = Find(element,"dd","class","comm pcol2").toPlainText(); - strComm = GetSafeUtf(strComm); + //strComm = GetSafeUtf(strComm); if (strComm.isEmpty()== false) { strComm.replace("'","\\'"); strComm.replace("\"","\\\""); strComm = strComm.trimmed(); - + //cout << strComm.toStdString() << endl; QSqlQuery query; - query.prepare(QString("insert into " + m_strTable + " (Url,Nickname,Data,Date,Parent,UrlReply,RowNum) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8()); - query.bindValue(":URL", strUrl.toUtf8()); - query.bindValue(":NICK",strNick.toUtf8()); - query.bindValue(":DATA",strComm.toUtf8()); - query.bindValue(":DATE",strDate.toUtf8()); - query.bindValue(":PARENT",QString("").toUtf8()); - query.bindValue(":URLREPLY",m_strUrl.toUtf8()); - query.bindValue(":ROWNUM",QString::number(nCount++).toUtf8()); + query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8()); + cout << "m_strTable = " << m_strTable.toStdString() << endl; + cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl; + cout << "data = " << strComm.toStdString() << endl; + cout << "date = " << strDate.toStdString() << endl; + cout << "urlReply = " << m_strUrl.toStdString() << endl; + cout << "ronum = " << nCount << endl; + query.bindValue(":URL", strUrl.toUtf8()); + query.bindValue(":NICK",strNick.toUtf8()); + query.bindValue(":DATA",strComm.toUtf8()); + query.bindValue(":DATE",strDate.toUtf8()); + query.bindValue(":PARENT",QString("").toUtf8()); + query.bindValue(":URLREPLY",m_strUrl.toUtf8()); + query.bindValue(":ROWNUM",(nCount++)); - if (query.exec()==false) - cout << "error : " << query.lastError().text().toStdString(); + if (query.exec()==false) + cout << "error : " << query.lastError().text().toStdString(); } } if (element.attribute("class") == "reply _countableComment ") @@ -491,19 +503,26 @@ void SCrawler::saveFrameComment(QWebFrame *frame) if (strComm.isEmpty() == false) { - strComm = GetSafeUtf(strComm); + // strComm = GetSafeUtf(strComm); strComm.replace("'","\\'"); strComm.replace("\"","\\\""); strComm = strComm.trimmed(); QSqlQuery query; - query.prepare(QString("insert into " + m_strTable + " (Url,Nickname,Data,Date,Parent,UrlReply,RowNum) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8()); + cout << "m_strTable = " << m_strTable.toStdString() << endl; + cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl; + cout << "data = " << strComm.toStdString() << endl; + cout << "date = " << strDate.toStdString() << endl; + cout << "urlReply = " << m_strUrl.toStdString() << endl; + cout << "ronum = " << nCount << endl; + cout << "parent = " << strParent.toStdString() << endl; + query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8()); query.bindValue(":URL",strUrl.toUtf8()); query.bindValue(":NICK",strNick.toUtf8()); query.bindValue(":DATA",strComm.toUtf8()); query.bindValue(":DATE",strDate.toUtf8()); query.bindValue(":PARENT",strParent.toUtf8()); query.bindValue(":URLREPLY",m_strUrl.toUtf8()); - query.bindValue(":ROWNUM",QString::number(nCount++).toUtf8()); + query.bindValue(":ROWNUM",(nCount++)); if (query.exec()==false) { cout << "error : " << query.lastError().text().toStdString(); diff --git a/NaverBlogCrawler/snaverblogmanage.cpp b/NaverBlogCrawler/snaverblogmanage.cpp index f7dd30c..bff6956 100644 --- a/NaverBlogCrawler/snaverblogmanage.cpp +++ b/NaverBlogCrawler/snaverblogmanage.cpp @@ -17,9 +17,14 @@ QString SNaverBlogManage::makeGetListQuery(QString _str,QDate _date) //http://cafeblog.search.naver.com/search.naver?where=post&sm=tab_pge&query=%EC%95%84%EC%9D%B4%ED%8F%B0&st=date&date_option=6&date_from=20131103&date_to=20131103&dup_remove=1&post_blogurl=&post_blogurl_without=&srchby=all&nso=so%3Add%2Cp%3Afrom20131103to20131103&ie=utf8&start=11 str = "http://cafeblog.search.naver.com/search.naver?where=post&sm=tab_pge&query="; str += EncodetoUtf8(_str,true); - str += "&st=date&date_option=6&date_from=" + strDate + "&date_to=" + strDate ; + //str += "&st=date&date_option=6&date_from=" + strDate + "&date_to=" + strDate ; + str += "&st=date&date_option=6&date_from="; + str += strDate; + str += "&date_to="; + str += strDate ; str += "&dup_remove=1&post_blogurl=&post_blogurl_without=&srchby=all&nso=so%3Add%2Cp%3Afrom"; - str += strDate + "to" + strDate +"&ie=utf8&start=" + QString::number(m_ncList); + str += strDate + "to" + strDate +"&ie=utf8&start="; + str += QString::number(m_ncList); return str; } @@ -54,7 +59,7 @@ void SNaverBlogManage::processFinished(QProcess *pPro,QString _strOut) m_bLast = true; m_strListURL.clear(); QSqlQuery query; - if(query.exec("SELECT URL FROM " + C_TABLE_URL + QString::number(m_nUrlTable) + " where ERROR is null")) + if(query.exec("SELECT ARTICLE_URL FROM " + C_TABLE_URL + QString::number(m_nUrlTable) + " where ERROR is null")) { m_pMain->InsertLog(m_nID,query.lastError().text()); }