네이버 뉴스 수정

네이버 블로그 본문 수정

git-svn-id: svn://192.168.0.12/source@231 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2015-12-24 01:39:40 +00:00
parent 820fa031e2
commit 6b3521002a
4 changed files with 378 additions and 18 deletions

View File

@@ -5,11 +5,12 @@
#include <QByteArray>
#include <QDebug>
#include <QTimer>
#include <QThread>
using namespace std;
const int BLOG_RETRY_MAX = 4;
const int BLOG_RETRY_INTERVAL = 3000;
const int RETRY_MAX = 4;
const int RETRY_INTERVAL = 3000;
struct SProxyList
{
@@ -20,7 +21,7 @@ struct SProxyList
SCrawler::SCrawler():QObject()
{
m_page = new QWebPage;
m_nBlogRetryCount = 0;
m_nRetryCount = 0;
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}
@@ -79,6 +80,20 @@ void SCrawler::load(QStringList _strlistArgv)
m_nSelect = E_NAVER_BLOG_REPLY;
}
if (_strlistArgv[1] == "news_data")
{
m_strUrl = _strlistArgv[2];
m_nSelect = E_NAVER_NEWS_DATA;
m_strKeywordID = _strlistArgv[4];
}
if (_strlistArgv[1] == "news_comm")
{
m_strUrl = _strlistArgv[2];
m_nSelect = E_NAVER_NEWS_REPLY;
}
if (_strlistArgv.size() > 3)
m_strTable = "data_" + _strlistArgv[3];
}
@@ -173,10 +188,27 @@ void SCrawler::saveResult(bool ok)
emit finished();
return;
}
qDebug() << "load complete";
switch(m_nSelect)
{
case E_NAVER_NEWS_LIST:saveFrameNewsList(m_page->mainFrame());break;
case E_NAVER_NEWS_DATA:
{
static bool loaded = false;
if(!loaded)
{
saveFrameNewsUrl(m_page->mainFrame());
bodydata.sendDB();
}
loaded = true;
break;
}
case E_NAVER_NEWS_REPLY:
{
if(!saveFrameNewsComment(m_page->mainFrame()))
return;
break;
}
case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break;
case E_NAVER_CAFE_DATA:
{
@@ -236,6 +268,7 @@ void SCrawler::saveResult(bool ok)
break;
case E_NAVER_BLOG_REPLY:
case E_NAVER_NEWS_REPLY:
case E_DAUM_BLOG_REPLY:
if (m_bUse)
{
@@ -247,6 +280,7 @@ void SCrawler::saveResult(bool ok)
case E_NAVER_BLOG_BODY:
case E_DAUM_CAFE_DATA:
case E_DAUM_BLOG_BODY:
case E_NAVER_NEWS_DATA:
if (m_bUse == false)
{
cout << "fail";
@@ -585,7 +619,8 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame)
}
{
QWebElement body = Find(post,"div","class","post-view pcol2 _param(1)");
//QWebElement body = Find(post,"div","class","post-view pcol2 _param(1)");
QWebElement body = post.findFirst("div[class^='post-view pcol2 _param(1)']");
if (body.toPlainText().isEmpty()==false)
{
str[E_DATA_DATA] = body.toPlainText();
@@ -612,11 +647,11 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame)
//retry if profile is empty and sympathy is empty
if((strProfile.isEmpty() || sympathy.isEmpty()) && (m_nBlogRetryCount < BLOG_RETRY_MAX))
if((strProfile.isEmpty() || sympathy.isEmpty()) && (m_nRetryCount < RETRY_MAX))
{
m_nBlogRetryCount++;
qDebug() << m_nBlogRetryCount;
QTimer::singleShot(BLOG_RETRY_INTERVAL, this, SLOT(crawlBlog()));
m_nRetryCount++;
qDebug() << m_nRetryCount;
QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage()));
return false;
}
}
@@ -661,7 +696,7 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame)
return b_ok;
}
void SCrawler::crawlBlog()
// Slot targeted by the QTimer::singleShot() retry timers in this file.
// It does not start a new page load itself; it re-enters saveResult() and
// reports the load as successful so the current frame is processed again.
void SCrawler::reloadPage()
{
    const bool bLoadSucceeded = true;
    saveResult(bLoadSucceeded);
}
@@ -1076,7 +1111,6 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
{
if (eleSubUrl.attribute("class") == "f_url")
strUrl = eleSubUrl.attribute("href");
}
if (strUrl.split("/").at(2) == "cafe.daum.net")
@@ -1480,6 +1514,320 @@ void SCrawler::saveFrameNewsList(QWebFrame *frame)
m_bUse = true;
}
// Extracts the body of a Naver news article from the loaded frame and stores
// the fields in `bodydata` (the caller is responsible for sending it to the DB).
//
// Steps:
//   1. Delete any rows in m_strTable whose article_url equals m_strUrl.
//   2. Read title, date, body text, like count and press (platform) info from
//      the DOM, with fallbacks for the alternative page layout.
//   3. Fill `bodydata` and set m_bUse so the frame is not processed twice.
//
// @param frame  Frame whose document is scraped.
void SCrawler::saveFrameNewsUrl(QWebFrame *frame)
{
    if (m_bUse) return;

    {
        // The URL is bound as a parameter instead of being concatenated into
        // the statement, so a URL containing a quote can neither break nor
        // alter the query. The table name cannot be bound and is still
        // concatenated, as in the insert statements elsewhere in this file.
        QSqlQuery query;
        query.prepare(QString("delete from " + m_strTable + " where article_url = :URL").toUtf8());
        query.bindValue(":URL", m_strUrl.toUtf8());
        if (query.exec() == false)
        {
            cout << query.lastError().text().toStdString();
            cout << query.lastQuery().toStdString();
        }
    }

    QString strTitle, strDate, strData, strPlatID, strPlatTitle, strlike;
    {
        QWebElement element = Find(frame->documentElement(), "div", "class", "article_info");
        strTitle = Find(element, "h3", "id", "articleTitle").toPlainText();   // Title
        strDate  = Find(element, "span", "class", "t11").toPlainText();       // Date

        strData = Find(frame->documentElement(), "div", "id", "articleBodyContents").toPlainText();
        strlike = Find(frame->documentElement(), "div", "class", "u_likeit_module").toPlainText();

        // Fallback selectors (marked "entertainment" by the original author)
        // used when the default layout yields nothing.
        if (strTitle.isEmpty())
        {
            QWebElement elementTitle = Find(frame->documentElement(), "div", "class", "end_ct_area");
            strTitle = Find(elementTitle, "p", "class", "end_tit").toPlainText();
        }
        if (strDate.isEmpty())
            strDate = Find(element, "em").toPlainText();
        // "articeBody" is kept exactly as in the original selector.
        if (strData.isEmpty())
            strData = Find(frame->documentElement(), "div", "id", "articeBody").toPlainText();

        // Append the like count to the stored body text when the module exists.
        if (strlike.isEmpty() == false)
            strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")";

        element = Find(frame->documentElement(), "div", "class", "press_logo");
        strPlatID    = Find(element, "a").attribute("href");
        strPlatTitle = Find(element, "img").attribute("alt");

        // For "http://www.<name>.<tld>..." keep only <name> as the platform id.
        const QStringList strlistPlat = strPlatID.split(".");
        if (strlistPlat.size() > 2 && strlistPlat.at(0) == QString("http://www"))
            strPlatID = strlistPlat.at(1);
    }

    bodydata.setTable(m_strTable);
    bodydata.setData(bodydata.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE);
    bodydata.setData(bodydata.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA);
    bodydata.setData(strPlatID, SCrawlerData::PLATFORM_ID);
    bodydata.setData(strPlatTitle, SCrawlerData::PLATFORM_TITLE);
    bodydata.setData(strDate, SCrawlerData::ARTICLE_DATE);
    bodydata.setData("naver", SCrawlerData::PLATFORM_NAME);
    bodydata.setData("news", SCrawlerData::PLATFORM_FORM);
    bodydata.setData("body", SCrawlerData::ARTICLE_FORM);
    bodydata.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
    bodydata.setData(m_strKeywordID, SCrawlerData::KEYWORD_ID);
    m_bUse = true;
}
bool SCrawler::saveFrameNewsComment(QWebFrame *frame)
{
if (m_bUse) return true;
static bool bReplyDone = false;
static bool bReplyReplyDone = false;
static int reply_index = 0;
qDebug() << "executed";
QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button");
if(!a.isNull())
{
while(!bReplyDone)
{
QWebElement current = Find(a, "em", "class", "u_cbox_page_on __cbox_page_current");
QWebElement total = Find(a, "em", "class", "u_cbox_page_total __cbox_page_total");
QString str_current = current.toPlainText();
QString str_total = total.toPlainText();
bool ok;
int n_current = str_current.replace(",", "").toInt(&ok);
if(!ok)
break;
int n_total = str_total.replace(",", "").toInt(&ok);
if(!ok)
break;
if(n_current >= n_total)
{
bReplyDone = true;
break;
}
a.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
QTimer::singleShot(100, this, SLOT(reloadPage()));
qDebug() << "load comments";
return false;
}
QWebElementCollection reply_btns = frame->findAllElements("a[class='u_cbox_btn_reply']");
for(;reply_index < reply_btns.count() ; reply_index++)
{
QWebElement btn = Find(reply_btns[reply_index], "span", "class", "u_cbox_reply_cnt");
if(btn.isNull())
continue;
else
{
btn.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
QTimer::singleShot(100, this, SLOT(reloadPage()));
//reply_index += 1;
qDebug() << reply_index;
return false;
}
}
/*
foreach(QWebElement a, reply_btns)
{
QWebElement btn = Find(a, "span", "class", "u_cbox_reply_cnt");
if(btn.isNull())
continue;
else
{
btn.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
//QTimer::singleShot(100, this, SLOT(reloadPage()));
qDebug() << "qq";
//return false;
}
}
*/
}
else
{
if(m_nRetryCount < RETRY_MAX)
{
m_nRetryCount++;
qDebug() << m_nRetryCount;
QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage()));
return false;
}
else
{
m_bUse = true;
return true;
}
}
{
QWebElement logo = Find(frame->documentElement(),"div","class","press_logo");
QString strPlatID, strPlatTitle;
{
strPlatID = Find(logo,"a").attribute("href");
strPlatTitle = Find(logo,"img").attribute("alt");
}
QStringList strlistPlat = strPlatID.split(".");
if(strlistPlat.size() > 2)
{
if (strlistPlat.at(0) == QString("http://www"))
strPlatID = strlistPlat.at(1);
}
//QWebElement ul = frame->findFirstElement("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']");
QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li");
int order = 0;
foreach(QWebElement li, lis)
{
qDebug() << "li";
QWebElement comment_box = li.findFirst("div[class='u_cbox_comment_box']");
QString strParent;
{
QString strID, strNick, strData, strLike, strDislike, strDate;
strData = Find(comment_box, "span", "class", "u_cbox_contents").toPlainText();
strNick = strParent = strID = Find(comment_box, "span", "class", "u_cbox_name").toPlainText();
strLike = Find(comment_box, "em", "class", "u_cbox_cnt_recomm").toPlainText().replace(",", "");
strDislike = Find(comment_box, "em", "class", "u_cbox_cnt_unrecomm").toPlainText().replace(",", "");
strData += "\n(goodCount:" + strLike +")\n(badCount:" + strDislike + ")";
strDate = Find(comment_box, "span", "class", "u_cbox_date").toPlainText();
if(strDate.contains(":"))
strDate += ":00";
else
{
QDateTime current_time = QDateTime::currentDateTime();
QRegExp rx("(\\d+)");
int pos = 0;
QString strTime;
while ((pos = rx.indexIn(strDate, pos)) != -1)
{
strTime = rx.cap(1);
pos += rx.matchedLength();
}
if(strDate.contains("시간"))
{
current_time = current_time.addSecs(-(60 * 60 * strTime.toInt()));
}
else if(strDate.contains(""))
{
current_time = current_time.addDays(-(strTime.toInt()));
}
else if(strDate.contains(""))
{
current_time = current_time.addDays(-(60 * strTime.toInt()));
}
else
{
;
}
strDate = current_time.toString("yyyy-MM-dd hh:mm:ss");
qDebug() << strDate;
}
{
QSqlQuery query;
query.prepare(QString("insert into " + m_strTable +
" (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_order,platform_id,platform_title, article_date) "
"VALUES ('naver','news','reply',:URL,:ID,:NICK,:DATA,:ROWNUM,:PLATFORMID,:TITLE,:DATE)").toUtf8());
query.bindValue(":URL",m_strUrl.replace("&m_view=1","").toUtf8());
query.bindValue(":ID",strID.toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strData.toUtf8());
query.bindValue(":ROWNUM",order++);
query.bindValue(":PLATFORMID",strPlatID.toUtf8());
query.bindValue(":TITLE",strPlatTitle.toUtf8());
query.bindValue(":DATE", strDate.toUtf8());
if (query.exec()==false)
cout << "error : " << query.lastError().text().toStdString();
}
}
QWebElement reply_area = li.findFirst("div[class='u_cbox_reply_area']");
QWebElementCollection sub_lis = reply_area.findAll("ul[class='u_cbox_list']>li");
foreach(QWebElement sub_li, sub_lis)
{
QString strID, strNick, strData, strDate;
strData = Find(sub_li, "span", "class", "u_cbox_contents").toPlainText();
strNick = strID = Find(sub_li, "span", "class", "u_cbox_name").toPlainText();
strDate = Find(comment_box, "span", "class", "u_cbox_date").toPlainText();
if(strDate.contains(":"))
strDate += ":00";
else
{
QDateTime current_time = QDateTime::currentDateTime();
QRegExp rx("(\\d+)");
int pos = 0;
QString strTime;
while ((pos = rx.indexIn(strDate, pos)) != -1)
{
strTime = rx.cap(1);
pos += rx.matchedLength();
}
if(strDate.contains("시간"))
{
current_time = current_time.addSecs(-(60 * 60 * strTime.toInt()));
}
else if(strDate.contains(""))
{
current_time = current_time.addDays(-(strTime.toInt()));
}
else if(strDate.contains(""))
{
current_time = current_time.addDays(-(60 * strTime.toInt()));
}
else
{
;
}
strDate = current_time.toString("yyyy-MM-dd hh:mm:ss");
}
{
QSqlQuery query;
query.prepare(QString("insert into " + m_strTable +
" (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_order,platform_id,platform_title, article_date, article_parent) "
"VALUES ('naver','news','reply',:URL,:ID,:NICK,:DATA,:ROWNUM,:PLATFORMID,:TITLE,:DATE,:PARENT)").toUtf8());
query.bindValue(":URL",m_strUrl.replace("&m_view=1","").toUtf8());
query.bindValue(":ID",strID.toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strData.toUtf8());
query.bindValue(":ROWNUM",order++);
query.bindValue(":PLATFORMID",strPlatID.toUtf8());
query.bindValue(":TITLE",strPlatTitle.toUtf8());
query.bindValue(":DATE", strDate.toUtf8());
query.bindValue(":PARENT", strParent.toUtf8());
if (query.exec()==false)
cout << "error : " << query.lastError().text().toStdString();
}
}
}
qDebug() << "lis count: " << lis.count();
}
//Debug("c:\\data\\replytest.html", frame->toHtml());
m_bUse = true;
return true;
}
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
QWebElementCollection elements = _FindElement.findAll(_strElement);

View File

@@ -19,7 +19,9 @@ public:
E_DAUM_BLOG_LIST,
E_DAUM_BLOG_BODY,
E_DAUM_BLOG_REPLY,
E_NAVER_NEWS_LIST
E_NAVER_NEWS_LIST,
E_NAVER_NEWS_DATA,
E_NAVER_NEWS_REPLY,
};
public:
SCrawler();
@@ -31,7 +33,7 @@ signals:
void finished();
private slots:
void saveResult(bool ok);
void crawlBlog();
void reloadPage();
private:
int m_nSelect;
QString m_strReper;
@@ -50,7 +52,7 @@ private:
bool m_bNothing;
QString m_strProxyIP;
int m_nProxyPort;
int m_nBlogRetryCount;
int m_nRetryCount;
QString SqlString(QString _str);
QString GetSafeUtf(QString _strData);
void saveFrameList(QWebFrame *frame);
@@ -64,13 +66,15 @@ private:
void saveFrameDaumBlogComment(QWebFrame *frame);
void saveFrameDaumCafeUrl(QWebFrame *frame);
void saveFrameNewsList(QWebFrame *frame);
void saveFrameNewsUrl(QWebFrame *frame);
bool saveFrameNewsComment(QWebFrame *frame);
int GetNumber(QString _str);
bool getProxyList(QString &_str);
void setProxy();
void deleteProxy();
QWebElement Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind);
QWebElement Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib="",const QString _strFind="");
QWebElement FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength);
QWebElement FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart);
QWebElement FindRight(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind);
@@ -80,4 +84,5 @@ private:
void UpdateError(QString _strError);
};
#endif // SCRAWLER_H

View File

@@ -4,6 +4,7 @@
#include <QVariant>
#include <QSqlDatabase>
#include <QSqlError>
#include <QDebug>
using namespace std;
SCrawlerData::SCrawlerData()
{
@@ -26,6 +27,7 @@ SCrawlerData::SCrawlerData()
m_strColumn[PLATFORM_NAME] = "platform_name";
m_strColumn[PLATFORM_TITLE] = "platform_title";
m_strColumn[REPLY_URL] = "reply_url";
//m_strColumn[ETC] = "etc";
}
SCrawlerData::~SCrawlerData()
@@ -94,12 +96,15 @@ bool SCrawlerData::sendDB()
{
if(i == ARTICLE_ORDER)
query.bindValue(QString(":" + m_strColumn[i]), m_strData[i].trimmed().toInt());
query.bindValue(QString(":" + m_strColumn[i]), m_strData[i].trimmed().toUtf8());
else
query.bindValue(QString(":" + m_strColumn[i]), m_strData[i].trimmed().toUtf8());
}
if (query.exec()==false)
{
cout << "error : " << query.lastError().text().toStdString();
cout << endl << query.lastQuery().toStdString() << endl ;
return false;
}
return true;

View File

@@ -28,7 +28,8 @@ public:
PLATFORM_ID,
KEYWORD_ID,
REPLY_URL,
TOTAL_COUNT
//ETC,
TOTAL_COUNT,
};
private:
@@ -37,7 +38,6 @@ private:
QString m_strTable;
private:
QString GetSafeUtf(QString _strData);
QString getTable();
public:
@@ -45,6 +45,8 @@ public:
~SCrawlerData();
QStringList GetNumber(QString _str);
QString getData(int _num);
QString GetSafeUtf(QString _strData);
void setData(QString _str, int _num);
void clear();
void clear(int _num);