다음 크롤러 추가

네이버 블로그 크롤러 카페 형식에 맞게 수정


git-svn-id: svn://192.168.0.12/source@46 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2015-03-13 02:20:08 +00:00
parent 2069613a9a
commit 57b4fe61c8
9 changed files with 275 additions and 94 deletions

View File

@@ -16,10 +16,12 @@ SOURCES += main.cpp\
widget.cpp\
smanage.cpp \
snavercafemanage.cpp \
snaverblogmanage.cpp
snaverblogmanage.cpp \
sdaumcafemanage.cpp
HEADERS += widget.h \
smanage.h \
snavercafemanage.h \
snaverblogmanage.h
snaverblogmanage.h \
sdaumcafemanage.h

View File

@@ -0,0 +1,154 @@
#include "SDaumcafemanage.h"
#include "widget.h"
#include <QSqlQuery>
#include <QSqlError>
#include <qDebug>
#include <QFile>
#include <QNetworkProxy>
/**
 * Constructs the Daum cafe crawl manager.
 *
 * @param pObject forwarded unchanged to the SManage base constructor.
 */
SDaumCafeManage::SDaumCafeManage(QObject *pObject)
    : SManage(pObject)
{
    // m_nID is not declared by this class, so it cannot go in the
    // initializer list and is assigned in the body instead.
    this->m_nID = 0;
}
/**
 * Builds the Daum cafe search URL for one keyword, one day and one page.
 *
 * @param _str   keyword; run through EncodetoUtf8(_str, true) before being
 *               appended after "q=".
 * @param _date  day to search; formatted as yyyyMMdd and used for both the
 *               "sd=" value (suffix 000000) and the "ed=" value (suffix 235959).
 * @param _nPage number written after "p=". The trailing "page=1" is a fixed
 *               literal and does not follow this argument.
 * @return the assembled URL; it is also written to qDebug().
 */
QString SDaumCafeManage::makeGetListQuery(QString _str,QDate _date,int _nPage)
{
    // The same day string is needed for the start and the end of the range.
    const QString strDay = _date.toString("yyyyMMdd");

    QString strUrl = "http://search.daum.net/search?nil_suggest=btn&w=cafe&lpp=10&q=";
    strUrl += EncodetoUtf8(_str,true);
    strUrl += "&period=u&p=" + QString::number(_nPage);
    strUrl += "&sd=" + strDay + "000000";
    strUrl += "&ed=" + strDay + "235959";
    strUrl += "&page=1&DA=PGD&m=board";

    qDebug() << strUrl;
    return strUrl;
}
/**
 * Resets the state machine: clears the final-stop flag and selects
 * E_PROCESS_LIST_RUN, so the next Update() call takes the list-request branch.
 */
void SDaumCafeManage::Start()
{
    m_bFinalLast = false;
    m_nMode = E_PROCESS_LIST_RUN;
}
bool SDaumCafeManage::Update()
{
if (m_bFinalLast) return m_bFinalLast;
switch(m_nMode)
{
case E_PROCESS_LIST_RUN:
if (UseProcess() == false && CheckTime())
{
m_strListQuery = makeGetListQuery(m_strKeyword,m_date,m_ncList);
m_pMain->InsertLog("Start : " + QString::number(m_ncList) + " Date : " + m_date.toString("yyyy-MM-dd"));
{
m_pro[0].start("CrawlerProcess",QStringList()<< "daum" << "cafe_list" << m_strListQuery << m_strGroupID << m_strKeywordID);
m_ncList+=1;
}
m_nMode = E_PROCESS_LIST_FINISH_WAIT;
m_nWait = 0;
}
break;
case E_PROCESS_URL_RUN:
if (UseProcess() == false && CheckTime())
{
m_pMain->InsertLog("(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
{
m_pro[0].start("CrawlerProcess",QStringList() << "daum" << "cafe_data" << m_strListURL.at(m_ncUrl) << m_strGroupID << m_strListQuery << "" );
m_ncUrl++;
}
m_nMode = E_PROCESS_URL_FINISH_WAIT;
m_nWait = 0;
}
break;
case E_PROCESS_LIST_FINISH_WAIT:
case E_PROCESS_URL_FINISH_WAIT:
m_nWait++;
if (m_nWait >= 60)
{
//for(int i = 0; i < C_PROCESS_MAX ; i++)
{
// if (m_pro[0].state() == QProcess::Running)
{
m_pMain->InsertLog("Kill Process.");
m_pro[0].kill();
}
}
if (m_nMode == E_PROCESS_LIST_FINISH_WAIT) return m_bFinalLast;
}
break;
}
return m_bFinalLast;
}
/**
 * Consumes the output of the CrawlerProcess run that just ended and selects
 * the next state.
 *
 * In E_PROCESS_LIST_FINISH_WAIT the output is scanned line by line; every line
 * whose first character is 'o' contributes the text after its two-character
 * prefix (trimmed) to m_strListURL. Output ending in "last" (or m_ncList
 * reaching 1000) sets m_bLast; output ending in "block" sets m_bFinalLast.
 *
 * In E_PROCESS_URL_FINISH_WAIT the next queued URL is scheduled, or, when the
 * queue is exhausted, the machine returns to the list state.
 *
 * @param _pPro   the finished process; not used by this implementation.
 * @param _strOut text produced by the finished process.
 */
void SDaumCafeManage::processFinished(QProcess *_pPro,QString _strOut)
{
    (void)_pPro;

    switch(m_nMode)
    {
    case E_PROCESS_LIST_FINISH_WAIT:
    {
        m_bLast = (_strOut.right(4) == "last" || m_ncList >= 1000);
        if (_strOut.right(5) == "block")
            m_bFinalLast = true;

        m_strListURL.clear();
        const QStringList lines = _strOut.split("\n");
        for (int i = 0; i < lines.size(); ++i)
        {
            const QString &line = lines.at(i);
            if (line.isEmpty() || line.at(0) != QChar('o'))
                continue;

            // mid(2) instead of right(length()-2): for a one-character line the
            // latter receives a negative count and returns the whole line, which
            // queued the bare marker "o" as if it were a URL.
            const QString url = line.mid(2).trimmed();

            // A marker followed only by whitespace carries no URL; do not queue
            // an empty entry that would later be handed to CrawlerProcess.
            if (!url.isEmpty())
                m_strListURL.push_back(url);
        }

        m_ncUrl = 0;
        if (m_strListURL.isEmpty())
        {
            // Nothing to fetch for this page: go back to requesting a list.
            // NOTE(review): CheckLast() is defined elsewhere; presumably it
            // reacts to m_bLast — confirm.
            m_nMode = E_PROCESS_LIST_RUN;
            CheckLast();
        }
        else
            m_nMode = E_PROCESS_URL_RUN;
        break;
    }
    case E_PROCESS_URL_FINISH_WAIT:
        if (m_ncUrl >= m_strListURL.size())
        {
            // Every queued URL has been processed.
            m_nMode = E_PROCESS_LIST_RUN;
            CheckLast();
            m_bLast = false;
        }
        else
            m_nMode = E_PROCESS_URL_RUN;
        break;
    }
}

View File

@@ -0,0 +1,31 @@
#ifndef SDAUMCAFEMANAGE_H
#define SDAUMCAFEMANAGE_H
#include "SManage.h"
/**
 * Crawl manager for Daum cafe search, derived from SManage.
 *
 * Alternates between requesting a result list and requesting each URL found
 * in that list, tracking its position with E_PROCESS_STATE.
 */
class SDaumCafeManage : public SManage
{
public:
    /** States of the list/URL cycle. */
    enum E_PROCESS_STATE
    {
        E_PROCESS_LIST_RUN = 0,        // a list request may be started
        E_PROCESS_LIST_FINISH_WAIT,    // a list request is in flight
        E_PROCESS_URL_RUN,             // a URL request may be started
        E_PROCESS_URL_FINISH_WAIT      // a URL request is in flight
    };

    SDaumCafeManage(QObject *pObject);

protected:
    // NOTE(review): presumably these implement virtuals declared in SManage — confirm.
    bool Update();
    void Start();
    void processFinished(QProcess *_pPro,QString _strOut);

private:
    /** Builds the search URL for a keyword, a day and a page number. */
    QString makeGetListQuery(QString _str,QDate _date,int _nPage);

    QString m_strListQuery;          // URL of the most recent list request
    QVector<QString> m_strListURL;   // URLs collected from the last list output
};
#endif // SDAUMCAFEMANAGE_H

View File

@@ -8,6 +8,11 @@ SManage::SManage(QObject *parent) :
connect(&m_pro[i],SIGNAL(finished(int,QProcess::ExitStatus)),SLOT(processFinished(int,QProcess::ExitStatus)));
}
/**
 * Resets m_pMain to null. The object it pointed to is not deleted here.
 */
SManage::~SManage()
{
    this->m_pMain = 0;
}
void SManage::Start(QDate _StartDate,QDate _EndDate,QString _strKeyword,QString _strAuthorship,QString _strKeywordID,QString _strGroupID,int _nStart,QString _strTime)
{
m_date = _StartDate;

View File

@@ -19,6 +19,7 @@ private slots:
void processFinished(int exitCode, QProcess::ExitStatus exitStatus);
public:
explicit SManage(QObject *parent = 0);
~SManage();
public:
void Start(QDate _StartDate,QDate _EndDate,QString _strKeyword,QString _strAuthorship,QString _strKeywordID,QString _strGroupID,int _nStart,QString _strTime);
void SetParent(Widget *pWidget);

View File

@@ -14,9 +14,15 @@ SNaverBlogManage::SNaverBlogManage(QObject *pObject) : SManage(pObject) , C_TABL
QString SNaverBlogManage::makeGetListQuery(QString _str,QDate _date)
{
//http://cafeblog.search.naver.com/search.naver?where=post&query=%EC%84%B1%ED%98%95&ie=utf8&st=date&sm=tab_opt&date_from=20140101&date_to=20150311&date_option=6&srchby=all&dup_remove=1&post_blogurl=&post_blogurl_without=&nso=so%3Add%2Ca%3Aall%2Cp%3Afrom20140101to20150311&mson=0
//http://cafeblog.search.naver.com/search.naver?where=post&sm=tab_pge&query=%EC%84%B1%ED%98%95&st=date&date_option=6&date_from=20140101&date_to=20140101&dup_remove=1&post_blogurl=&post_blogurl_without=&srchby=all&nso=so%3Add%2Cp%3Afrom20140101to20140101&ie=utf8&start=31
QString str;
QString strDate = _date.toString("yyyyMMdd");
//http://cafeblog.search.naver.com/search.naver?where=post&sm=tab_pge&query=%EC%95%84%EC%9D%B4%ED%8F%B0&st=date&date_option=6&date_from=20131103&date_to=20131103&dup_remove=1&post_blogurl=&post_blogurl_without=&srchby=all&nso=so%3Add%2Cp%3Afrom20131103to20131103&ie=utf8&start=11
/*
str = "http://cafeblog.search.naver.com/search.naver?where=post&sm=tab_pge&query=";
str += EncodetoUtf8(_str,true);
//str += "&st=date&date_option=6&date_from=" + strDate + "&date_to=" + strDate ;
@@ -27,6 +33,22 @@ QString SNaverBlogManage::makeGetListQuery(QString _str,QDate _date)
str += "&dup_remove=1&post_blogurl=&post_blogurl_without=&srchby=all&nso=so%3Add%2Cp%3Afrom";
str += strDate + "to" + strDate +"&ie=utf8&start=";
str += QString::number(m_ncList);
*/
str = "http://cafeblog.search.naver.com/search.naver?where=post&sm=tab_pge&query=";
str += EncodetoUtf8(_str,true);
str += "&st=date&date_option=6&date_from=";
str += strDate;
str += "&date_to=";
str += strDate;
str += "&dup_remove=1&post_blogurl=";
//if(m_strAuthorship.length() > 0)
// str += "blog.naver.com%2F";
str += m_strAuthorship;
str += "&post_blogurl_without=&srchby=all&nso=so%3Add%2Cp%3Afrom";
str += strDate + "to" + strDate + "&ie=utf8&start=";
str += QString::number(m_ncList);
return str;
}
@@ -34,12 +56,28 @@ QString SNaverBlogManage::makeGetCommentQuery(QString _strUrl)
{
//http://blog.naver.com/kohaku3533/220149821481/CommentList.nhn?blogId=kohaku3533&logNo=220149821481&currentPage=&isMemolog=false&focusingCommentNo=&showLastPage=true&shortestContentAreaWidth=false;
QStringList strList = _strUrl.split("/");
QString strOut = _strUrl;
strOut += "/CommentList.nhn?blogId=";
strOut += strList.at(3);
strOut += "&logNo=";
strOut += strList.at(4);
strOut += "&currentPage=&isMemolog=false&focusingCommentNo=&showLastPage=true&shortestContentAreaWidth=false";
QString strOut = "";
if(strList.at(2).compare("blog.naver.com") == 0)
{
strOut = _strUrl;
strOut += "/CommentList.nhn?blogId=";
strOut += strList.at(3);
strOut += "&logNo=";
strOut += strList.at(4);
strOut += "&currentPage=&isMemolog=false&focusingCommentNo=&showLastPage=true&shortestContentAreaWidth=false";
}
else //id.blog.me
{
strOut = "http://blog.naver.com/";
strOut += strList.at(2).split(".").at(0);
strOut += "/";
strOut += strList.at(3);
strOut += "/CommentList.nhn?blogId=";
strOut += strList.at(2).split(".").at(0);
strOut += "&logNo=";
strOut += strList.at(3);
strOut += "&currentPage=&isMemolog=false&focusingCommentNo=&showLastPage=true&shortestContentAreaWidth=false";
}
return strOut;
}
@@ -66,16 +104,6 @@ void SNaverBlogManage::processFinished(QProcess *pPro,QString _strOut)
if (str.at(0) == QChar('o'))
m_strListURL.push_back(str.right(str.length()-2).trimmed());
}
/*
QSqlQuery query;
if(query.exec("SELECT ARTICLE_URL FROM " + C_TABLE_URL + QString::number(m_nUrlTable) + " where ERROR is null"))
{
m_pMain->InsertLog(m_nID,query.lastError().text());
}
while (query.next())
m_strListURL.append(query.value(0).toString());
*/
m_ncUrl = 0;
if (m_strListURL.size() == 0)
{
@@ -93,7 +121,6 @@ void SNaverBlogManage::processFinished(QProcess *pPro,QString _strOut)
}
break;
case E_PROCESS_COMMENT_FINISH_WAIT:
//if (UseProcess() == false)
if (m_ncUrl >= m_strListURL.size())
{
m_nMode = E_PROCESS_LIST_RUN;
@@ -108,8 +135,8 @@ void SNaverBlogManage::processFinished(QProcess *pPro,QString _strOut)
bool SNaverBlogManage::Update()
{
//m_pMain->InsertLog("Blog Update start");
if (m_bFinalLast) return m_bFinalLast;
switch(m_nMode)
{
case E_PROCESS_LIST_RUN:
@@ -118,13 +145,8 @@ bool SNaverBlogManage::Update()
m_strQuery = makeGetListQuery(m_strKeyword,m_date);
//m_pMain->InsertLog(m_nID,"Start : " + QString::number(m_ncList) + " Date : " + m_date.toString("yyyy-MM-dd"));
m_pMain->InsertLog("Start : " + QString::number(m_ncList) + " Date : " + m_date.toString("yyyy-MM-dd"));
m_pMain->InsertLog(m_strQuery);
//for(int i = 0; i < C_PROCESS_MAX ; i++)
{
m_pro[0].start("CrawlerProcess",QStringList()<< "naver" << "blog_list" << m_strQuery << m_strGroupID << m_strKeywordID);
// m_pro[0].SetState(SProcess::STATE_RUNNING);
m_pMain->InsertLog("m_pro[0] is started");
m_ncList+=10;
}
m_nMode = E_PROCESS_LIST_FINISH_WAIT;
@@ -134,25 +156,8 @@ bool SNaverBlogManage::Update()
case E_PROCESS_URL_RUN:
if (UseProcess() == false && CheckTime())
{
/*
for(int i = 0; i < C_PROCESS_MAX ; i++)
{
m_pro[0].start("CrawlerProcess",QStringList() << m_strListURL.at(m_ncUrl) << "url");
}
int i = 0;
foreach(QString strUrl,m_strListURL)
//for(int i = 0; i < C_PROCESS_MAX ; i++)
{
m_pro[i++].start("CrawlerProcess",QStringList() << strUrl << "url" << C_TABLE_URL + QString::number(m_nUrlTable));
if (i >= C_PROCESS_MAX) break;
}
*/
//m_pro[0].start("CrawlerProcess",QStringList() << m_strListURL.at(m_ncUrl) << "blog_url" << C_TABLE_URL + QString::number(m_nUrlTable));
//m_pMain->InsertLog(m_nID,"(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
m_pMain->InsertLog("(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_url" << m_strListURL.at(m_ncUrl) << m_strGroupID << "" );
// m_pro[0].SetState(SProcess::STATE_RUNNING);
m_nMode = E_PROCESS_URL_FINISH_WAIT;
m_nWait = 0;
}
@@ -160,17 +165,7 @@ bool SNaverBlogManage::Update()
case E_PROCESS_COMMENT_RUN:
if (UseProcess() == false && CheckTime())
{
//int i=0;
//foreach(QString strUrl,m_strListURL)
{
//m_pro[i++].start("CrawlerProcess",QStringList() << makeGetCommentQuery(strUrl)<< "comment" << C_TABLE_COM + QString::number(m_nUrlTable));
//if (i >= C_PROCESS_MAX) break;
//m_ncUrl++;
}
//m_pMain->InsertLog(m_nID,"(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")");
m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_comm" << makeGetCommentQuery(m_strListURL.at(m_ncUrl++)) << m_strGroupID << "" );
//m_pro[0].start("CrawlerProcess",QStringList() << makeGetCommentQuery(m_strListURL.at(m_ncUrl++))<< "blog_comm" << C_TABLE_COM + QString::number(m_nUrlTable));
//m_pro[0].SetState(SProcess::STATE_RUNNING);
m_nMode = E_PROCESS_COMMENT_FINISH_WAIT;
m_nWait = 0;
}
@@ -181,30 +176,13 @@ bool SNaverBlogManage::Update()
m_nWait++;
if (m_nWait >=60)
{
//for(int i = 0; i < C_PROCESS_MAX ; i++)
{
{
{
m_pro[0].kill();
//m_pMain->InsertLog(m_nID,"Kill Process.");
m_pMain->InsertLog("Kill Process.");
m_pro[0].kill();
}
}
if (m_nMode == E_PROCESS_LIST_FINISH_WAIT) return m_bFinalLast;
/*
QString strQuery = "update blog set ";
strQuery += "Error ";
strQuery += "='";
strQuery += "Kill Process";
strQuery += "'";
strQuery += " where URL='";
if (m_nMode == E_PROCESS_COMMENT_FINISH_WAIT)
strQuery += m_strListURL.at(m_ncUrl-1);
else
strQuery += m_strListURL.at(m_ncUrl);
strQuery += "'";
QSqlQuery sql;
sql.exec(strQuery);
*/
}
break;
}

View File

@@ -43,6 +43,7 @@ void SNaverCafeManage::Start()
bool SNaverCafeManage::Update()
{
if (m_bFinalLast) return m_bFinalLast;
switch(m_nMode)
{
case E_PROCESS_LIST_RUN:
@@ -77,10 +78,10 @@ bool SNaverCafeManage::Update()
{
//for(int i = 0; i < C_PROCESS_MAX ; i++)
{
if (m_pro[0].state() == QProcess::Running)
// if (m_pro[0].state() == QProcess::Running)
{
m_pro[0].kill();
m_pMain->InsertLog("Kill Process.");
m_pro[0].kill();
}
}
if (m_nMode == E_PROCESS_LIST_FINISH_WAIT) return m_bFinalLast;

View File

@@ -6,9 +6,10 @@
#include <QFileDialog>
#include <QPushButton>
#include <QSqlQuery>
#include <QThread>
#include "snavercafemanage.h"
#include "snaverblogmanage.h"
#include "sdaumcafemanage.h"
Widget::Widget(QWidget *parent) : QWidget(parent)
{
connect(&m_timer, SIGNAL(timeout()), this, SLOT(Update()));
@@ -51,10 +52,14 @@ Widget::Widget(QWidget *parent) : QWidget(parent)
}
setLayout(vlayout);
m_pNaverCafe = new SNaverCafeManage(this);
m_pNaverBlog = new SNaverBlogManage(this);
m_pDaumCafe = new SDaumCafeManage(this);
m_pManage[0] = m_pNaverCafe;
m_pManage[1] = m_pNaverBlog;
m_pManage[2] = m_pDaumCafe;
m_db = QSqlDatabase::addDatabase("QMYSQL");
m_db.setHostName("bigbird.iptime.org");
@@ -149,24 +154,17 @@ void Widget::StartButton()
str += query.value(2).toString() + " -> ";
str += query.value(3).toString();
int m_nPlatform = 0;
if(query.value(6).toInt() == 0)
{
InsertLog("0");
m_nPlatform = 0;
}
else if(query.value(6).toInt() == 1)
{
InsertLog("1");
m_nPlatform = 1;
}
m_nPlatform = 0;
if(0 <= query.value(6).toInt() && query.value(6).toInt() < C_CRAWLER_MAX)
m_nPlatform = query.value(6).toInt();
else
{
InsertLog("fail");
InsertLog("Starting Crawler is failed");
return;
}
m_pManage[m_nPlatform]->Start(QDate::fromString(query.value(0).toString(),"yyyy-MM-dd"),
QDate::fromString(query.value(1).toString(),"yyyy-MM-dd"),
query.value(2).toString().trimmed(),// keyword
@@ -174,7 +172,7 @@ void Widget::StartButton()
query.value(4).toString().trimmed(),// keyword_id
query.value(5).toString().trimmed(),
1,
m_pedTime->text().trimmed());
m_pedTime->text().trimmed());
}
void Widget::StopButton()
@@ -184,6 +182,7 @@ void Widget::StopButton()
void Widget::Update()
{
if (m_db.isOpen() == false)
{
if (m_db.open())
@@ -193,14 +192,20 @@ void Widget::Update()
return;
}
}
int nCount = 0;
for (int i = 0 ; i < C_CRAWLER_MAX ; i++)
nCount += m_pManage[i]->Update();
//int nCount = 0;
//for (int i = 0 ; i < C_CRAWLER_MAX ; i++)
//nCount +=
m_pManage[m_nPlatform]->Update();
/*
if (nCount == C_CRAWLER_MAX)
{
InsertLog("Finish...");
m_timer.stop();
}
*/
}
void Widget::RefreshButton()
@@ -221,6 +226,8 @@ void Widget::RefreshButton()
str += ", NaverCafe";
else if(query.value(6).toString().toInt() == 1)
str += ", NaverBlog";
else if(query.value(6).toString().toInt() == 2)
str += ", DaumCafe";
m_pcb->addItem(str,query.value(5));
}
}

View File

@@ -11,6 +11,7 @@
class SNaverCafeManage;
class SNaverBlogManage;
class SDaumCafeManage;
class SManage;
#define SAFE_DELETE(p) {if(p) delete (p); (p) = NULL; }
@@ -29,7 +30,7 @@ private:
QLineEdit *m_pedTime;
QTimer m_timer;
QSqlDatabase m_db;
static const int C_CRAWLER_MAX = 2;
static const int C_CRAWLER_MAX = 3;
SManage *m_pManage[C_CRAWLER_MAX];
QListWidget *m_pResultList;
QString m_strFileName;
@@ -37,7 +38,8 @@ private:
QVector <QString> m_vecSelect;
SNaverCafeManage *m_pNaverCafe;
SNaverBlogManage *m_pNaverBlog;
int m_nStartTime,m_nRangeTime;
SDaumCafeManage *m_pDaumCafe;
int m_nStartTime,m_nRangeTime,m_nPlatform;
private:
QString makeCafeGetListQuery(QString _str,QDate _date,int _nPage);
private slots: