git-svn-id: svn://192.168.0.12/source@32 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
323
CrawlerList/snaverblogmanage.cpp
Normal file
323
CrawlerList/snaverblogmanage.cpp
Normal file
@@ -0,0 +1,323 @@
|
||||
#include "snaverblogmanage.h"
|
||||
#include "widget.h"
|
||||
#include <QSqlQuery>
|
||||
#include <QSqlError>
|
||||
#include <qDebug>
|
||||
#include <QFile>
|
||||
#include <QTextStream>
|
||||
#include <QNetworkProxy>
|
||||
|
||||
/**
 * Constructs a Naver-blog crawl manager.
 *
 * @param pObject Passed unchanged to the SManage base-class constructor.
 *
 * The two staging-table name prefixes are fixed here; the numeric suffix is
 * kept in m_nUrlTable and is chosen later by MakeTables().
 */
SNaverBlogManage::SNaverBlogManage(QObject *pObject)
    : SManage(pObject)
    , C_TABLE_URL("NAVER_BLOG_BODY_")
    , C_TABLE_COM("NAVER_BLOG_REPLY_")
    , m_nUrlTable(-1)   // previously indeterminate until MakeTables() ran; -1 = "no table chosen yet"
{
    m_nID = 0;
}
|
||||
|
||||
/**
 * Builds the Naver blog-search URL for one keyword restricted to one day.
 *
 * @param _str  Search keyword; it is passed through EncodetoUtf8(_str, true)
 *              before being placed in the "query=" parameter
 *              (presumably URL-encoding -- confirm against SManage).
 * @param _date Day to search; formatted as yyyyMMdd and used for both
 *              date_from and date_to as well as the "nso" range.
 * @return The complete URL, ending in "&start=<m_ncList>".
 */
QString SNaverBlogManage::makeGetListQuery(QString _str,QDate _date)
{
    // Sample of the generated URL:
    // http://cafeblog.search.naver.com/search.naver?where=post&sm=tab_pge&query=%EC%95%84%EC%9D%B4%ED%8F%B0&st=date&date_option=6&date_from=20131103&date_to=20131103&dup_remove=1&post_blogurl=&post_blogurl_without=&srchby=all&nso=so%3Add%2Cp%3Afrom20131103to20131103&ie=utf8&start=11
    const QString day = _date.toString("yyyyMMdd");

    QString url("http://cafeblog.search.naver.com/search.naver?where=post&sm=tab_pge&query=");
    url += EncodetoUtf8(_str,true);

    // Same day on both ends of the date filter.
    url.append("&st=date&date_option=6&date_from=").append(day);
    url.append("&date_to=").append(day);

    // The "nso" parameter repeats the range as from<day>to<day>.
    url.append("&dup_remove=1&post_blogurl=&post_blogurl_without=&srchby=all&nso=so%3Add%2Cp%3Afrom");
    url.append(day).append("to").append(day);

    // Paging offset of the result list.
    url.append("&ie=utf8&start=").append(QString::number(m_ncList));
    return url;
}
|
||||
|
||||
/**
 * Builds the comment-list URL for one blog post.
 *
 * @param _strUrl Post URL; it is split on '/' and parts 3 and 4 are used as
 *                blogId and logNo (i.e. "http://blog.naver.com/<blogId>/<logNo>").
 * @return _strUrl followed by "/CommentList.nhn?blogId=<blogId>&logNo=<logNo>&currentPage=...",
 *         or an empty QString when _strUrl has fewer than five '/'-separated
 *         parts (the unchecked at(3)/at(4) used to go out of range in that case).
 */
QString SNaverBlogManage::makeGetCommentQuery(QString _strUrl)
{
    // Sample of the generated URL:
    // http://blog.naver.com/kohaku3533/220149821481/CommentList.nhn?blogId=kohaku3533&logNo=220149821481&currentPage=&isMemolog=false&focusingCommentNo=&showLastPage=true&shortestContentAreaWidth=false
    const QStringList parts = _strUrl.split("/");
    if (parts.size() < 5)
        return QString();   // not a <scheme>//<host>/<blogId>/<logNo> URL

    QString strOut = _strUrl;
    strOut += "/CommentList.nhn?blogId=";
    strOut += parts.at(3);
    strOut += "&logNo=";
    strOut += parts.at(4);
    // "&currentPage" had been collapsed into the currency sign (an HTML
    // "&curren" entity decode), which corrupted the query string.
    strOut += "&currentPage=&isMemolog=false&focusingCommentNo=&showLastPage=true&shortestContentAreaWidth=false";
    return strOut;
}
|
||||
|
||||
void SNaverBlogManage::Start()
|
||||
{
|
||||
m_nMode = E_PROCESS_LIST_RUN;
|
||||
m_ncList = 1;
|
||||
m_bFinalLast = false;
|
||||
}
|
||||
|
||||
/**
 * Moves the state machine forward from one of the *_FINISH_WAIT states.
 * (Presumably invoked by SManage when a crawler process ends -- confirm.)
 *
 * @param pPro    Not used by this implementation.
 * @param _strOut Text produced by the process. Only inspected in
 *                E_PROCESS_LIST_FINISH_WAIT, where every non-empty line that
 *                starts with 'o' contributes one post URL (the text after the
 *                first two characters, trimmed) and a trailing "last" marks
 *                the final result page.
 *
 * Any other value of m_nMode leaves the state untouched.
 */
void SNaverBlogManage::processFinished(QProcess *pPro,QString _strOut)
{
    if (m_nMode == E_PROCESS_LIST_FINISH_WAIT)
    {
        // Last page: either the process said so or the paging offset hit its cap.
        m_bLast = (_strOut.right(4) == "last") || (m_ncList >= 991);

        // Rebuild the URL list from the lines tagged with a leading 'o'.
        m_strListURL.clear();
        const QStringList rows = _strOut.split("\n");
        for (int i = 0; i < rows.size(); ++i)
        {
            const QString &row = rows.at(i);
            if (row.isEmpty() || row.at(0) != QChar('o'))
                continue;
            m_strListURL.push_back(row.right(row.length()-2).trimmed());
        }

        m_ncUrl = 0;
        if (m_strListURL.isEmpty())
        {
            // Nothing to fetch on this page: go back to requesting lists.
            m_nMode = E_PROCESS_LIST_RUN;
            CheckLast();
        }
        else
        {
            m_nMode = E_PROCESS_URL_RUN;
        }
    }
    else if (m_nMode == E_PROCESS_URL_FINISH_WAIT)
    {
        // Move on to the comments of the same post once no process is in use.
        if (UseProcess() == false)
            m_nMode = E_PROCESS_COMMENT_RUN;
    }
    else if (m_nMode == E_PROCESS_COMMENT_FINISH_WAIT)
    {
        if (m_ncUrl >= m_strListURL.size())
        {
            // Every URL of the current page has been handled.
            m_nMode = E_PROCESS_LIST_RUN;
            CheckLast();
            m_bLast = false;
        }
        else
        {
            m_nMode = E_PROCESS_URL_RUN;
        }
    }
}
|
||||
|
||||
bool SNaverBlogManage::Update()
|
||||
{
|
||||
//m_pMain->InsertLog("Blog Update start");
|
||||
if (m_bFinalLast) return m_bFinalLast;
|
||||
switch(m_nMode)
|
||||
{
|
||||
case E_PROCESS_LIST_RUN:
|
||||
if (UseProcess() == false && CheckTime())
|
||||
{
|
||||
m_strQuery = makeGetListQuery(m_strKeyword,m_date);
|
||||
//m_pMain->InsertLog(m_nID,"Start : " + QString::number(m_ncList) + " Date : " + m_date.toString("yyyy-MM-dd"));
|
||||
m_pMain->InsertLog("Start : " + QString::number(m_ncList) + " Date : " + m_date.toString("yyyy-MM-dd"));
|
||||
m_pMain->InsertLog(m_strQuery);
|
||||
//for(int i = 0; i < C_PROCESS_MAX ; i++)
|
||||
{
|
||||
m_pro[1].start("CrawlerProcess",QStringList()<< "naver" << "blog_list" << m_strQuery << m_strGroupID << m_strKeywordID);
|
||||
// m_pro[1].SetState(SProcess::STATE_RUNNING);
|
||||
m_pMain->InsertLog("m_pro[1] is started");
|
||||
|
||||
m_ncList+=10;
|
||||
}
|
||||
m_nMode = E_PROCESS_LIST_FINISH_WAIT;
|
||||
m_nWait = 0;
|
||||
}
|
||||
break;
|
||||
case E_PROCESS_URL_RUN:
|
||||
if (UseProcess() == false && CheckTime())
|
||||
{
|
||||
/*
|
||||
for(int i = 0; i < C_PROCESS_MAX ; i++)
|
||||
{
|
||||
m_pro[1].start("CrawlerProcess",QStringList() << m_strListURL.at(m_ncUrl) << "url");
|
||||
}
|
||||
|
||||
int i = 0;
|
||||
foreach(QString strUrl,m_strListURL)
|
||||
//for(int i = 0; i < C_PROCESS_MAX ; i++)
|
||||
{
|
||||
m_pro[i++].start("CrawlerProcess",QStringList() << strUrl << "url" << C_TABLE_URL + QString::number(m_nUrlTable));
|
||||
if (i >= C_PROCESS_MAX) break;
|
||||
}
|
||||
*/
|
||||
//m_pro[1].start("CrawlerProcess",QStringList() << m_strListURL.at(m_ncUrl) << "blog_url" << C_TABLE_URL + QString::number(m_nUrlTable));
|
||||
//m_pMain->InsertLog(m_nID,"(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
|
||||
m_pMain->InsertLog("(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
|
||||
m_pro[1].start("CrawlerProcess",QStringList() << "naver" << "blog_url" << m_strListURL.at(m_ncUrl) << m_strGroupID << "" );
|
||||
// m_pro[1].SetState(SProcess::STATE_RUNNING);
|
||||
m_nMode = E_PROCESS_URL_FINISH_WAIT;
|
||||
m_nWait = 0;
|
||||
}
|
||||
break;
|
||||
case E_PROCESS_COMMENT_RUN:
|
||||
if (UseProcess() == false && CheckTime())
|
||||
{
|
||||
//int i=0;
|
||||
//foreach(QString strUrl,m_strListURL)
|
||||
{
|
||||
//m_pro[i++].start("CrawlerProcess",QStringList() << makeGetCommentQuery(strUrl)<< "comment" << C_TABLE_COM + QString::number(m_nUrlTable));
|
||||
//if (i >= C_PROCESS_MAX) break;
|
||||
//m_ncUrl++;
|
||||
}
|
||||
//m_pMain->InsertLog(m_nID,"(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
|
||||
m_pro[1].start("CrawlerProcess",QStringList() << "naver" << "blog_comm" << makeGetCommentQuery(m_strListURL.at(m_ncUrl++)) << m_strGroupID << "" );
|
||||
//m_pro[1].start("CrawlerProcess",QStringList() << makeGetCommentQuery(m_strListURL.at(m_ncUrl++))<< "blog_comm" << C_TABLE_COM + QString::number(m_nUrlTable));
|
||||
//m_pro[1].SetState(SProcess::STATE_RUNNING);
|
||||
m_nMode = E_PROCESS_COMMENT_FINISH_WAIT;
|
||||
m_nWait = 0;
|
||||
}
|
||||
break;
|
||||
case E_PROCESS_LIST_FINISH_WAIT:
|
||||
case E_PROCESS_URL_FINISH_WAIT:
|
||||
case E_PROCESS_COMMENT_FINISH_WAIT:
|
||||
m_nWait++;
|
||||
if (m_nWait >=60)
|
||||
{
|
||||
//for(int i = 0; i < C_PROCESS_MAX ; i++)
|
||||
{
|
||||
{
|
||||
m_pro[1].kill();
|
||||
//m_pMain->InsertLog(m_nID,"Kill Process.");
|
||||
m_pMain->InsertLog("Kill Process.");
|
||||
}
|
||||
}
|
||||
if (m_nMode == E_PROCESS_LIST_FINISH_WAIT) return m_bFinalLast;
|
||||
/*
|
||||
QString strQuery = "update blog set ";
|
||||
strQuery += "Error ";
|
||||
strQuery += "='";
|
||||
strQuery += "Kill Process";
|
||||
strQuery += "'";
|
||||
strQuery += " where URL='";
|
||||
if (m_nMode == E_PROCESS_COMMENT_FINISH_WAIT)
|
||||
strQuery += m_strListURL.at(m_ncUrl-1);
|
||||
else
|
||||
strQuery += m_strListURL.at(m_ncUrl);
|
||||
strQuery += "'";
|
||||
QSqlQuery sql;
|
||||
sql.exec(strQuery);
|
||||
*/
|
||||
}
|
||||
break;
|
||||
}
|
||||
return m_bFinalLast;
|
||||
}
|
||||
|
||||
void SNaverBlogManage::MakeTables()
|
||||
{
|
||||
QString strQuery = "show tables";
|
||||
QSqlQuery query;
|
||||
query.exec(strQuery);
|
||||
int nUrlMax = -1;
|
||||
while (query.next())
|
||||
{
|
||||
QString str = query.value(0).toString();
|
||||
if (str.left(C_TABLE_URL.size()) == C_TABLE_URL.toUpper())
|
||||
{
|
||||
if (nUrlMax < str.mid(C_TABLE_URL.size()).toInt())
|
||||
nUrlMax = str.mid(C_TABLE_URL.size()).toInt();
|
||||
}
|
||||
}
|
||||
m_nUrlTable = nUrlMax + 1;
|
||||
strQuery = "Create table " + C_TABLE_URL + QString::number(m_nUrlTable)+ "(Url CHAR(128) not null primary key,keyword_id INT,PlatformTitle CHAR(128),PlatformID CHAR(64),ArticleTitle VARCHAR(128),ArticleID CHAR(32),Date DATETIME,Nickname CHAR(32),Data VARCHAR(18432),Error CHAR(32)) CHARSET=utf8";
|
||||
query.exec(strQuery);
|
||||
strQuery = "Create table " + C_TABLE_COM + QString::number(m_nUrlTable)+ "(Url CHAR(128) not null,Nickname CHAR(32),Data VARCHAR(1024),Parent CHAR(64),Date DATETIME,UrlReply VARCHAR(512),RowNum INT) CHARSET=utf8";
|
||||
query.exec(strQuery);
|
||||
/*
|
||||
strQuery = "Create table " + C_TABLE_URL + QString::number(m_nUrlTable)+ "(URL varchar(128) not null primary key , ID varchar(64) ,NICK varchar(64),TITLE varchar(256), DATA varchar(20480) , DATE char(32) , BLOG_ID varchar(64) , OTHER varchar(128), ERROR char(128)) CHARSET=utf8";
|
||||
query.exec(strQuery);
|
||||
strQuery = "Create table " + C_TABLE_COM + QString::number(m_nUrlTable)+ "(URL varchar(128) not null, NICK varchar(64),DATA varchar(20480),PARENT varchar(64),DATE char(32),URL_COMMENT varchar(512) not null,URL_NICK varchar(128)) CHARSET=utf8";
|
||||
query.exec(strQuery);
|
||||
*/
|
||||
|
||||
m_pMain->setWindowTitle("NaverBlogCrawler " + QString::number(m_nUrlTable));
|
||||
}
|
||||
|
||||
void SNaverBlogManage::DropTables()
|
||||
{
|
||||
QString strQuery = "drop table ";
|
||||
QSqlQuery query;
|
||||
query.exec(strQuery + C_TABLE_URL + QString::number(m_nUrlTable));
|
||||
query.exec(strQuery + C_TABLE_COM + QString::number(m_nUrlTable));
|
||||
}
|
||||
|
||||
void SNaverBlogManage::Join()
|
||||
{
|
||||
//m_pMain->InsertLog(m_nID,"Insert Article Data...");
|
||||
m_pMain->InsertLog("Insert Article Data...");
|
||||
QString strQuery = "insert into "
|
||||
"data_" + m_strGroupID +
|
||||
"(platformname , platformform , articleform ,"
|
||||
"url , keyword_id , body_platformtitle , body_platformid , body_articletitle , body_articleid , body_date , body_nickname , body_data)"
|
||||
"select "
|
||||
"CONVERT('naver' USING utf8),"
|
||||
"CONVERT('blog' USING utf8),"
|
||||
"CONVERT('article' USING utf8),"
|
||||
"CONVERT(url USING utf8),"
|
||||
"CONVERT(keyword_id USING utf8),"
|
||||
"CONVERT(PlatformTitle USING utf8),"
|
||||
"CONVERT(PlatformID USING utf8),"
|
||||
"CONVERT(ArticleTitle USING utf8),"
|
||||
"CONVERT(ArticleID USING utf8),"
|
||||
"CONVERT(Date USING utf8),"
|
||||
"CONVERT(Nickname USING utf8),"
|
||||
"CONVERT(Data USING utf8)"
|
||||
"from NAVER_BLOG_BODY_" + QString::number(m_nUrlTable);
|
||||
QSqlQuery query;
|
||||
if (query.exec(strQuery) == false)
|
||||
{
|
||||
//m_pMain->InsertLog(m_nID,query.lastError().text());
|
||||
m_pMain->InsertLog(query.lastError().text());
|
||||
return;
|
||||
}
|
||||
|
||||
//m_pMain->InsertLog(m_nID,"Insert Reply Data...");
|
||||
m_pMain->InsertLog("Insert Reply Data...");
|
||||
|
||||
strQuery = "insert into "
|
||||
"data_" + m_strGroupID +
|
||||
"(platformname , platformform , articleform ,"
|
||||
"url , keyword_id , body_platformtitle , body_platformid , body_articletitle , body_articleid , body_date , body_nickname , body_data ,"
|
||||
"reply_nickname ,reply_data, reply_parent , reply_date ,reply_urlreply ,reply_rownum )"
|
||||
"select "
|
||||
"CONVERT('naver' USING utf8),"
|
||||
"CONVERT('blog' USING utf8),"
|
||||
"CONVERT('reply' USING utf8),"
|
||||
"CONVERT(_body.url USING utf8),"
|
||||
"CONVERT(_body.keyword_id USING utf8),"
|
||||
"CONVERT(_body.PlatformTitle USING utf8),"
|
||||
"CONVERT(_body.PlatformID USING utf8),"
|
||||
"CONVERT(_body.ArticleTitle USING utf8),"
|
||||
"CONVERT(_body.ArticleID USING utf8),"
|
||||
"CONVERT(_body.Date USING utf8),"
|
||||
"CONVERT(_body.Nickname USING utf8),"
|
||||
"CONVERT(_body.Data USING utf8),"
|
||||
"CONVERT(_reply.Nickname USING utf8),"
|
||||
"CONVERT(_reply.Data USING utf8),"
|
||||
"CONVERT(_reply.Parent USING utf8),"
|
||||
"CONVERT(_reply.Date USING utf8),"
|
||||
"CONVERT(_reply.UrlReply USING utf8),"
|
||||
"CONVERT(_reply.RowNum USING utf8) "
|
||||
"from " + C_TABLE_URL + QString::number(m_nUrlTable) + " _body INNER JOIN " + C_TABLE_COM + QString::number(m_nUrlTable) + " _reply ON _body.Url = _reply.Url";
|
||||
|
||||
if (query.exec(strQuery) == false)
|
||||
{
|
||||
//m_pMain->InsertLog(m_nID,query.lastError().text());
|
||||
m_pMain->InsertLog(query.lastError().text());
|
||||
return;
|
||||
}
|
||||
//m_pMain->InsertLog(m_nID,"Delete data ...");
|
||||
m_pMain->InsertLog("Delete data ...");
|
||||
query.exec("delete from NAVER_BLOG_BODY_" + QString::number(m_nUrlTable) );
|
||||
query.exec("delete from NAVER_BLOG_REPLY_" + QString::number(m_nUrlTable) );
|
||||
//m_pMain->InsertLog(m_nID,"Finish ... ");
|
||||
m_pMain->InsertLog("Finish ... ");
|
||||
}
|
||||
39
CrawlerList/snaverblogmanage.h
Normal file
39
CrawlerList/snaverblogmanage.h
Normal file
@@ -0,0 +1,39 @@
|
||||
#ifndef SNAVERBLOGMANAGE_H
|
||||
#define SNAVERBLOGMANAGE_H
|
||||
#include "SManage.h"
|
||||
|
||||
class SNaverBlogManage : public SManage
|
||||
{
|
||||
public:
|
||||
enum E_PROCESS_STATE
|
||||
{
|
||||
E_PROCESS_LIST_RUN = 0,
|
||||
E_PROCESS_LIST_FINISH_WAIT,
|
||||
E_PROCESS_URL_RUN,
|
||||
E_PROCESS_URL_FINISH_WAIT,
|
||||
E_PROCESS_COMMENT_RUN,
|
||||
E_PROCESS_COMMENT_FINISH_WAIT,
|
||||
};
|
||||
public:
|
||||
SNaverBlogManage(QObject *pObject);
|
||||
void MakeTables();
|
||||
void DropTables();
|
||||
void SaveCsv(QString _strName);
|
||||
void Join();
|
||||
int GetTableNumber() {return m_nUrlTable;}
|
||||
private:
|
||||
QString makeGetListQuery(QString _str,QDate _date);
|
||||
QString makeGetCommentQuery(QString _strUrl);
|
||||
private:
|
||||
QString m_strQuery;
|
||||
QVector <QString> m_strListURL;
|
||||
const QString C_TABLE_URL;
|
||||
const QString C_TABLE_COM;
|
||||
int m_nUrlTable;
|
||||
protected:
|
||||
bool Update();
|
||||
void Start();
|
||||
void processFinished(QProcess *pPro,QString _strOut);
|
||||
};
|
||||
|
||||
#endif // SNAVERBLOGMANAGE_H
|
||||
Reference in New Issue
Block a user