새로운 크롤러

git-svn-id: svn://192.168.0.12/source@19 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2015-02-03 08:19:33 +00:00
parent 035ccf60d7
commit 53dd9275da
7 changed files with 97 additions and 178 deletions

View File

@@ -4,7 +4,7 @@
#
#-------------------------------------------------
QT += core gui sql
QT += core gui sql network
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets
@@ -13,6 +13,11 @@ TEMPLATE = app
SOURCES += main.cpp\
widget.cpp
widget.cpp\
smanage.cpp \
snavercafemanage.cpp
HEADERS += widget.h \
smanage.h \
snavercafemanage.h
HEADERS += widget.h

View File

@@ -8,17 +8,34 @@ SManage::SManage(QObject *parent) :
connect(&m_pro[i],SIGNAL(finished(int,QProcess::ExitStatus)),SLOT(processFinished(int,QProcess::ExitStatus)));
}
/**
 * Stores the crawl parameters, parses the delay specification and arms the
 * first delay deadline (m_timeEnd), then calls the subclass-specific Start().
 *
 * @param _StartDate     first date to crawl (stored in m_date)
 * @param _EndDate       last date to crawl (stored in m_dateEnd)
 * @param _strKeyword    search keyword
 * @param _strAuthorship authorship filter (stored as-is)
 * @param _strKeywordID  keyword id (stored as-is)
 * @param _strGroupID    data group id (stored as-is)
 * @param _nStart        initial list offset (stored in m_ncList)
 * @param _strTime       delay specification in seconds: a single number
 *                       ("5") or a range written as "min~max" ("3~10").
 *                       Anything that cannot be parsed falls back to the
 *                       defaults start = 3, range = 1.
 */
void SManage::Start(QDate _StartDate,QDate _EndDate,QString _strKeyword,QString _strAuthorship,QString _strKeywordID,QString _strGroupID,int _nStart,QString _strTime)
{
    m_date = _StartDate;
    m_dateEnd = _EndDate;
    m_strKeyword = _strKeyword;
    m_bFinalLast = false;
    m_strKeywordID = _strKeywordID;
    m_strAuthorship = _strAuthorship;
    m_ncList = _nStart;
    m_strGroupID = _strGroupID;

    // Always start from well-defined defaults. The previous switch left both
    // members untouched when the text contained more than one '~', and its
    // "case 0" default could never run because split() on an empty string
    // still returns one (empty) element.
    m_nStartTime = 3;
    m_nRangeTime = 1;

    const QStringList parts = _strTime.split("~");
    bool bStartOk = false;
    const int nStart = parts.isEmpty() ? 0 : parts.at(0).trimmed().toInt(&bStartOk);
    if (bStartOk)
    {
        m_nStartTime = nStart;
        if (parts.size() >= 2)
        {
            bool bEndOk = false;
            const int nEnd = parts.at(1).trimmed().toInt(&bEndOk);
            // The range is used as the right operand of '%', so it must stay
            // strictly positive; "3~3" or a reversed range keeps the default 1.
            if (bEndOk && nEnd > nStart)
                m_nRangeTime = nEnd - nStart;
        }
    }

    // Next allowed launch time: now + start + [0, range) seconds.
    m_timeEnd = QDateTime::currentDateTime().addSecs(rand() % m_nRangeTime + m_nStartTime);
    Start();
}
@@ -71,17 +88,16 @@ bool SManage::UseProcess()
void SManage::processFinished(int exitCode,QProcess::ExitStatus exitStatus)
{
SProcess *pPro = (SProcess*)sender();
QProcess *pPro = (QProcess*)sender();
QString str = pPro->readAllStandardOutput();
QStringList list = str.split("\n");
foreach(QString log,list)
{
if (m_pMain)
m_pMain->InsertLog(m_nID,log);
m_pMain->InsertLog(log);
else
exit(0);
}
pPro->SetState(SProcess::STATE_WAIT);
processFinished(pPro,str);
pPro->kill();
}
@@ -106,10 +122,20 @@ void SManage::WaitExitProcess()
{
for(int i = 0; i < C_PROCESS_MAX ; i++)
{
if (m_pro[i].State() != SProcess::STATE_WAIT)
if (m_pro[i].state() == QProcess::Running)
m_pro[i].kill();
}
bQuit = UseProcess();
}
}
bool SManage::CheckTime()
{
if (QDateTime::currentDateTime() > m_timeEnd)
{
m_timeEnd = QDateTime::currentDateTime();
m_timeEnd = m_timeEnd.addSecs(rand() % m_nRangeTime + m_nStartTime);
return true;
}
return false;
}

View File

@@ -4,50 +4,52 @@
#include <QObject>
#include <QSqlDatabase>
#include <QDate>
#include <QDateTime>
#include <QStringList>
#include <QVector>
#include "SProcess.h"
#include <QProcess>
class Widget;
// Abstract QObject-based base class for a crawler manager. It stores the crawl
// parameters, owns the QProcess slots in m_pro, and leaves Update(), Start()
// and processFinished(QProcess*, QString) to be implemented by subclasses.
//
// NOTE(review): this listing appears to interleave the old and the new
// revision of the class (several declarations occur twice, once with SProcess /
// _nTime and once with QProcess / _strTime) -- confirm which lines are current
// before compiling.
class SManage : public QObject
{
Q_OBJECT
private:
QVector <QStringList> m_vecList;
private slots:
// Slot for QProcess::finished; reads the process output, logs it through
// m_pMain and forwards it to the virtual processFinished() below.
void processFinished(int exitCode, QProcess::ExitStatus exitStatus);
public:
explicit SManage(QObject *parent = 0);
public:
// NOTE(review): two Start() overloads with the same leading parameters are
// listed; the definition that parses a "min~max" text takes QString _strTime.
void Start(QDate _StartDate,QDate _EndDate,QString _strKeyword,QString _strAuthorship,QString _strKeywordID,QString _strGroupID,int _nStart,int _nTime);
void Start(QDate _StartDate,QDate _EndDate,QString _strKeyword,QString _strAuthorship,QString _strKeywordID,QString _strGroupID,int _nStart,QString _strTime);
void SetParent(Widget *pWidget);
void WaitExitProcess();
// One step of the crawler state machine; implemented by subclasses.
virtual bool Update() = 0;
signals:
public slots:
protected:
// Subclass hook invoked at the end of the public Start(...).
virtual void Start() = 0;
QString EncodetoUtf8(QString _str,bool _bExt=false);
// Subclass hook receiving the finished process and its standard output.
virtual void processFinished(SProcess *pPro,QString _strOut) = 0;
virtual void processFinished(QProcess *pPro,QString _strOut) = 0;
bool UseProcess();
void CheckLast();
// Returns true once the current time is past m_timeEnd and re-arms m_timeEnd.
bool CheckTime();
protected:
Widget *m_pMain;
QDate m_date,m_dateEnd;
// Earliest time at which CheckTime() will next return true.
QDateTime m_timeEnd;
int m_nMode;
QString m_strKeyword;
QString m_strKeywordID;
QString m_strGroupID;
QString m_strAuthorship;
bool m_bFinalLast;
int m_nTime;
bool m_bFinalLast;
int m_nID;
bool m_bLast;
int m_ncList;
int m_ncUrl;
int m_nWait;
// Delay parameters in seconds: each deadline is set to
// now + m_nStartTime + (rand() % m_nRangeTime).
int m_nStartTime;
int m_nRangeTime;
static const int C_PROCESS_MAX = 1;
SProcess m_pro[C_PROCESS_MAX];
private:
QVector <QStringList> m_vecList;
private slots:
void processFinished(int exitCode, QProcess::ExitStatus exitStatus);
QProcess m_pro[C_PROCESS_MAX];
};
#endif // SMANAGE_H

View File

@@ -4,6 +4,7 @@
#include <QSqlError>
#include <qDebug>
#include <QFile>
#include <QNetworkProxy>
SNaverCafeManage::SNaverCafeManage(QObject *pObject) : SManage(pObject)
{
@@ -45,13 +46,12 @@ bool SNaverCafeManage::Update()
switch(m_nMode)
{
case E_PROCESS_LIST_RUN:
if (UseProcess() == false)
if (UseProcess() == false && CheckTime())
{
m_strListQuery = makeGetListQuery(m_strKeyword,m_date,m_ncList);
m_pMain->InsertLog(m_nID,"Start : " + QString::number(m_ncList) + " Date : " + m_date.toString("yyyy-MM-dd"));
m_pMain->InsertLog("Start : " + QString::number(m_ncList) + " Date : " + m_date.toString("yyyy-MM-dd"));
{
m_pro[0].start("CrawlerProcess",QStringList()<< "naver" << "cafe_list" << m_strListQuery << m_strGroupID << m_strKeywordID);
m_pro[0].SetState(SProcess::STATE_RUNNING);
m_pro[0].start("CrawlerProcess",QStringList()<< "naver" << "cafe_list" << m_strListQuery << m_strGroupID << m_strKeywordID);
m_ncList+=10;
}
m_nMode = E_PROCESS_LIST_FINISH_WAIT;
@@ -59,12 +59,11 @@ bool SNaverCafeManage::Update()
}
break;
case E_PROCESS_URL_RUN:
if (UseProcess() == false)
if (UseProcess() == false && CheckTime())
{
m_pMain->InsertLog(m_nID,"(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
m_pMain->InsertLog("(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
{
m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "cafe_data" << m_strListURL.at(m_ncUrl) << m_strGroupID << m_strListQuery << "" );
m_pro[0].SetState(SProcess::STATE_RUNNING);
m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "cafe_data" << m_strListURL.at(m_ncUrl) << m_strGroupID << m_strListQuery << "" );
m_ncUrl++;
}
m_nMode = E_PROCESS_URL_FINISH_WAIT;
@@ -74,15 +73,14 @@ bool SNaverCafeManage::Update()
case E_PROCESS_LIST_FINISH_WAIT:
case E_PROCESS_URL_FINISH_WAIT:
m_nWait++;
if (m_nWait > (100000/m_nTime))
if (m_nWait >= 60)
{
//for(int i = 0; i < C_PROCESS_MAX ; i++)
{
if (m_pro[0].State() != SProcess::STATE_WAIT)
if (m_pro[0].state() == QProcess::Running)
{
m_pro[0].kill();
m_pro[0].SetState(SProcess::STATE_WAIT);
m_pMain->InsertLog(m_nID,"Kill Process.");
m_pMain->InsertLog("Kill Process.");
}
}
if (m_nMode == E_PROCESS_LIST_FINISH_WAIT) return m_bFinalLast;
@@ -92,7 +90,7 @@ bool SNaverCafeManage::Update()
return m_bFinalLast;
}
void SNaverCafeManage::processFinished(SProcess *_pPro,QString _strOut)
void SNaverCafeManage::processFinished(QProcess *_pPro,QString _strOut)
{
switch(m_nMode)
{
@@ -101,6 +99,10 @@ void SNaverCafeManage::processFinished(SProcess *_pPro,QString _strOut)
m_bLast = false;
if (_strOut.right(4) == "last" || m_ncList >= 1000)
m_bLast = true;
if (_strOut.right(5) == "block")
m_bFinalLast = true;
m_strListURL.clear();
foreach(QString str,_strOut.split("\n"))
{
@@ -108,16 +110,6 @@ void SNaverCafeManage::processFinished(SProcess *_pPro,QString _strOut)
if (str.at(0) == QChar('o'))
m_strListURL.push_back(str.right(str.length()-2).trimmed());
}
/*
QSqlQuery query;
if (query.exec("SELECT URL FROM " + C_TABLE_URL + QString::number(m_nUrlTable) + " where ERROR is null"))
{
m_pMain->InsertLog(m_nID,query.lastError().text());
}
while (query.next())
m_strListURL.append(query.value(0).toString());
*/
m_ncUrl = 0;
if (m_strListURL.size() == 0)
{
@@ -141,105 +133,3 @@ void SNaverCafeManage::processFinished(SProcess *_pPro,QString _strOut)
break;
}
}
/*
void SNaverCafeManage::MakeTables()
{
QString strQuery = "show tables";
QSqlQuery query;
query.exec(strQuery);
int nUrlMax = -1;
while (query.next())
{
QString str = query.value(0).toString();
if (str.left(C_TABLE_URL.size()) == C_TABLE_URL.toUpper())
{
if (nUrlMax < str.mid(C_TABLE_URL.size()).toInt())
nUrlMax = str.mid(C_TABLE_URL.size()).toInt();
}
}
m_nUrlTable = nUrlMax + 1;
strQuery = "Create table " + C_TABLE_URL + QString::number(m_nUrlTable)+ "(Url CHAR(128) not null primary key,keyword_id INT,PlatformTitle CHAR(128),PlatformID CHAR(64),ArticleTitle VARCHAR(128),ArticleID CHAR(32),Date DATETIME,Nickname CHAR(32),Data VARCHAR(18432),Error CHAR(32)) CHARSET=utf8";
query.exec(strQuery);
strQuery = "Create table " + C_TABLE_COM + QString::number(m_nUrlTable)+ "(Url CHAR(128) not null,Nickname CHAR(32),Data VARCHAR(1024),Parent CHAR(64),Date DATETIME,UrlReply VARCHAR(512),RowNum INT) CHARSET=utf8";
query.exec(strQuery);
m_pMain->setWindowTitle("NaverCafeCrawler " + QString::number(m_nUrlTable));
}
void SNaverCafeManage::DropTables()
{
QString strQuery = "drop table ";
QSqlQuery query;
query.exec(strQuery + C_TABLE_URL + QString::number(m_nUrlTable));
query.exec(strQuery + C_TABLE_COM + QString::number(m_nUrlTable));
}
void SNaverCafeManage::Join()
{
m_pMain->InsertLog(m_nID,"Insert Article Data...");
QString strQuery = "insert into "
"data_" + m_strGroupID +
"(platformname , platformform , articleform ,"
"url , keyword_id , body_platformtitle , body_platformid , body_articletitle , body_articleid , body_date , body_nickname , body_data)"
"select "
"CONVERT('naver' USING utf8),"
"CONVERT('cafe' USING utf8),"
"CONVERT('article' USING utf8),"
"CONVERT(url USING utf8),"
"CONVERT(keyword_id USING utf8),"
"CONVERT(PlatformTitle USING utf8),"
"CONVERT(PlatformID USING utf8),"
"CONVERT(ArticleTitle USING utf8),"
"CONVERT(ArticleID USING utf8),"
"CONVERT(Date USING utf8),"
"CONVERT(Nickname USING utf8),"
"CONVERT(Data USING utf8)"
"from NAVER_CAFE_BODY_" + QString::number(m_nUrlTable);
QSqlQuery query;
if (query.exec(strQuery) == false)
{
m_pMain->InsertLog(m_nID,query.lastError().text());
return;
}
m_pMain->InsertLog(m_nID,"Insert Reply Data...");
strQuery = "insert into "
"data_" + m_strGroupID +
"(platformname , platformform , articleform ,"
"url , keyword_id , body_platformtitle , body_platformid , body_articletitle , body_articleid , body_date , body_nickname , body_data ,"
"reply_nickname ,reply_data, reply_parent , reply_date ,reply_urlreply ,reply_rownum )"
"select "
"CONVERT('naver' USING utf8),"
"CONVERT('cafe' USING utf8),"
"CONVERT('reply' USING utf8),"
"CONVERT(_body.url USING utf8),"
"CONVERT(_body.keyword_id USING utf8),"
"CONVERT(_body.PlatformTitle USING utf8),"
"CONVERT(_body.PlatformID USING utf8),"
"CONVERT(_body.ArticleTitle USING utf8),"
"CONVERT(_body.ArticleID USING utf8),"
"CONVERT(_body.Date USING utf8),"
"CONVERT(_body.Nickname USING utf8),"
"CONVERT(_body.Data USING utf8),"
"CONVERT(_reply.Nickname USING utf8),"
"CONVERT(_reply.Data USING utf8),"
"CONVERT(_reply.Parent USING utf8),"
"CONVERT(_reply.Date USING utf8),"
"CONVERT(_reply.UrlReply USING utf8),"
"CONVERT(_reply.RowNum USING utf8) "
"from NAVER_CAFE_BODY_" + QString::number(m_nUrlTable) + " _body INNER JOIN NAVER_CAFE_REPLY_" + QString::number(m_nUrlTable) + " _reply ON _body.Url = _reply.Url";
query.exec(strQuery);
if (query.exec(strQuery) == false)
{
m_pMain->InsertLog(m_nID,query.lastError().text());
return;
}
m_pMain->InsertLog(m_nID,"Delete data ...");
query.exec("delete from NAVER_CAFE_BODY_" + QString::number(m_nUrlTable) );
query.exec("delete from NAVER_CAFE_REPLY_" + QString::number(m_nUrlTable) );
m_pMain->InsertLog(m_nID,"Finish ... ");
}
*/

View File

@@ -17,12 +17,11 @@ private:
QString makeGetListQuery(QString _str,QDate _date,int _nPage);
private:
QString m_strListQuery;
QVector <QString> m_strListURL;
int m_nUrlTable;
QVector <QString> m_strListURL;
protected:
bool Update();
void Start();
void processFinished(SProcess *pPro,QString _strOut);
void processFinished(QProcess *pPro,QString _strOut);
};
#endif // SNAVERCAFEMANAGE_H

View File

@@ -21,7 +21,7 @@ Widget::Widget(QWidget *parent) : QWidget(parent)
QPushButton *pbtStop = new QPushButton("Stop",this);
{
QObject::connect(pbtStart,SIGNAL(clicked()),this,SLOT(StartButton()));
QObject::connect(pbtStop,SIGNAL(clicked()),this,SLOT(StartButton()));
QObject::connect(pbtStop,SIGNAL(clicked()),this,SLOT(StopButton()));
QObject::connect(pbtRefresh,SIGNAL(clicked()),this,SLOT(RefreshButton()));
}
@@ -34,8 +34,7 @@ Widget::Widget(QWidget *parent) : QWidget(parent)
}
m_pedTime = new QLineEdit(this);
m_pedTime->setText(QString("500"));
m_pedTime->setText(QString("3~10"));
{
QHBoxLayout *hlayout = new QHBoxLayout;
hlayout->addWidget(m_pedTime);
@@ -45,9 +44,9 @@ Widget::Widget(QWidget *parent) : QWidget(parent)
}
{
QHBoxLayout *hlayout = new QHBoxLayout;
for (int i = 0; i < C_CRAWLER_MAX; i++ )
hlayout->addWidget(&m_aResultList[i]);
QHBoxLayout *hlayout = new QHBoxLayout;
m_pResultList = new QListWidget;
hlayout->addWidget(m_pResultList);
vlayout->addLayout(hlayout);
}
@@ -62,7 +61,7 @@ Widget::Widget(QWidget *parent) : QWidget(parent)
m_db.setDatabaseName("concepters");
if (!m_db.open())
{
InsertLog(0,"MySql Error...");
InsertLog("MySql Error...");
return;
}
@@ -78,16 +77,14 @@ Widget::~Widget()
m_pNaverCafe->SetParent(0);
}
void Widget::InsertLog(int _nSelect,QString str)
void Widget::InsertLog(QString str)
{
if (_nSelect >= C_CRAWLER_MAX) return;
QTime time = QTime::currentTime();
QString strOut = time.toString("[hh:mm:ss] ") + str;
m_aResultList[_nSelect].addItem(strOut);
m_pResultList->addItem(strOut);
QDate date = QDate::currentDate();
QFile file(date.toString(Qt::ISODate)+"_"+QString::number(_nSelect)+".log");
QFile file(date.toString(Qt::ISODate)+".log");
if (!file.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append))
return;
@@ -95,22 +92,22 @@ void Widget::InsertLog(int _nSelect,QString str)
out << strOut << "\n";
file.close();
if (m_aResultList[_nSelect].count() > 1024)
if (m_pResultList->count() > 1024)
{
m_aResultList[_nSelect].removeItemWidget(m_aResultList[_nSelect].item(0));
QListWidgetItem* item = m_aResultList[_nSelect].takeItem(0);
m_pResultList->removeItemWidget(m_pResultList->item(0));
QListWidgetItem* item = m_pResultList->takeItem(0);
delete item;
}
m_aResultList[_nSelect].setCurrentRow( m_aResultList[_nSelect].count() - 1 );
m_aResultList[_nSelect].repaint();
m_pResultList->setCurrentRow( m_pResultList->count() - 1 );
m_pResultList->repaint();
}
void Widget::StartButton()
{
{
m_timer.stop();
m_timer.start(m_pedTime->text().trimmed().toInt());
m_timer.start(1000);
QSqlQuery query;
QSqlQuery query;
query.exec("UPDATE crawling set state = '" + QString("run") + "' where id = '" + m_pcb->currentData().toString() + "'");
query.exec("SELECT _keyword.start,_keyword.end, _keyword.searches,_keyword.authorship,_keyword.id,_datagroup.id "
"FROM crawling _crawling INNER JOIN keyword _keyword ON _crawling.keyword_id = _keyword.id "
@@ -128,7 +125,7 @@ void Widget::StartButton()
query.value(4).toString().trimmed(),// keyword_id
query.value(5).toString().trimmed(),
1,
m_pedTime->text().trimmed().toInt());
m_pedTime->text().trimmed());
}
void Widget::StopButton()
@@ -142,18 +139,18 @@ void Widget::Update()
{
if (m_db.open())
{
InsertLog(0,"MySql Open Error...");
InsertLog("MySql Open Error...");
m_timer.stop();
return;
}
}
int nCount = 0;
for (int i = 0 ; i < C_CRAWLER_MAX ; i++)
nCount += m_pManage[i]->Update();
nCount += m_pManage[0]->Update();
if (nCount == C_CRAWLER_MAX)
{
InsertLog(0,"Finish...");
m_timer.stop();
InsertLog("Finish...");
m_timer.stop();
}
}

View File

@@ -21,19 +21,19 @@ public:
Widget(QWidget *parent = 0);
~Widget();
public:
void InsertLog(int _nSelect,QString str);
void InsertLog(QString str);
private:
QLineEdit *m_pedTime;
QTimer m_timer;
QSqlDatabase m_db;
static const int C_CRAWLER_MAX = 1;
SManage *m_pManage[C_CRAWLER_MAX];
QListWidget m_aResultList[C_CRAWLER_MAX];
QListWidget *m_pResultList;
QString m_strFileName;
QComboBox *m_pcb;
QVector <QString> m_vecSelect;
SNaverCafeManage *m_pNaverCafe;
int m_nStartTime,m_nRangeTime;
private:
QString makeCafeGetListQuery(QString _str,QDate _date,int _nPage);
private slots: