네이버 뉴스 수정
네이버 블로그 본문 수정 git-svn-id: svn://192.168.0.12/source@231 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -5,11 +5,12 @@
|
||||
#include <QByteArray>
|
||||
#include <QDebug>
|
||||
#include <QTimer>
|
||||
#include <QThread>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Upper bound compared against m_nBlogRetryCount before another blog retry is scheduled.
// NOTE(review): this looks like the pre-rename spelling of RETRY_MAX (the same
// condition appears once with each name further down) - confirm which one is live.
const int BLOG_RETRY_MAX = 4;
|
||||
// Delay handed to QTimer::singleShot() before the blog retry slot fires (milliseconds).
const int BLOG_RETRY_INTERVAL = 3000;
|
||||
// Upper bound compared against m_nRetryCount before another retry is scheduled.
const int RETRY_MAX = 4;
|
||||
// Delay handed to QTimer::singleShot() before reloadPage() fires (milliseconds).
const int RETRY_INTERVAL = 3000;
|
||||
|
||||
struct SProxyList
|
||||
{
|
||||
@@ -20,7 +21,7 @@ struct SProxyList
|
||||
// Creates the web page used for crawling, resets the retry counter and routes
// the page's loadFinished(bool) signal to saveResult(bool).
SCrawler::SCrawler():QObject()
{
    m_page = new QWebPage;
    // Only the renamed counter is initialised; the m_nBlogRetryCount line was the
    // removed side of the diff and referred to the counter's old name.
    m_nRetryCount = 0;
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}
|
||||
|
||||
@@ -79,6 +80,20 @@ void SCrawler::load(QStringList _strlistArgv)
|
||||
m_nSelect = E_NAVER_BLOG_REPLY;
|
||||
}
|
||||
|
||||
if (_strlistArgv[1] == "news_data")
|
||||
{
|
||||
m_strUrl = _strlistArgv[2];
|
||||
m_nSelect = E_NAVER_NEWS_DATA;
|
||||
m_strKeywordID = _strlistArgv[4];
|
||||
}
|
||||
|
||||
if (_strlistArgv[1] == "news_comm")
|
||||
{
|
||||
m_strUrl = _strlistArgv[2];
|
||||
m_nSelect = E_NAVER_NEWS_REPLY;
|
||||
}
|
||||
|
||||
|
||||
if (_strlistArgv.size() > 3)
|
||||
m_strTable = "data_" + _strlistArgv[3];
|
||||
}
|
||||
@@ -173,10 +188,27 @@ void SCrawler::saveResult(bool ok)
|
||||
emit finished();
|
||||
return;
|
||||
}
|
||||
|
||||
qDebug() << "load complete";
|
||||
switch(m_nSelect)
|
||||
{
|
||||
case E_NAVER_NEWS_LIST:saveFrameNewsList(m_page->mainFrame());break;
|
||||
case E_NAVER_NEWS_DATA:
|
||||
{
|
||||
static bool loaded = false;
|
||||
if(!loaded)
|
||||
{
|
||||
saveFrameNewsUrl(m_page->mainFrame());
|
||||
bodydata.sendDB();
|
||||
}
|
||||
loaded = true;
|
||||
break;
|
||||
}
|
||||
case E_NAVER_NEWS_REPLY:
|
||||
{
|
||||
if(!saveFrameNewsComment(m_page->mainFrame()))
|
||||
return;
|
||||
break;
|
||||
}
|
||||
case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break;
|
||||
case E_NAVER_CAFE_DATA:
|
||||
{
|
||||
@@ -236,6 +268,7 @@ void SCrawler::saveResult(bool ok)
|
||||
|
||||
break;
|
||||
case E_NAVER_BLOG_REPLY:
|
||||
case E_NAVER_NEWS_REPLY:
|
||||
case E_DAUM_BLOG_REPLY:
|
||||
if (m_bUse)
|
||||
{
|
||||
@@ -247,6 +280,7 @@ void SCrawler::saveResult(bool ok)
|
||||
case E_NAVER_BLOG_BODY:
|
||||
case E_DAUM_CAFE_DATA:
|
||||
case E_DAUM_BLOG_BODY:
|
||||
case E_NAVER_NEWS_DATA:
|
||||
if (m_bUse == false)
|
||||
{
|
||||
cout << "fail";
|
||||
@@ -585,7 +619,8 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame)
|
||||
}
|
||||
|
||||
{
|
||||
QWebElement body = Find(post,"div","class","post-view pcol2 _param(1)");
|
||||
//QWebElement body = Find(post,"div","class","post-view pcol2 _param(1)");
|
||||
QWebElement body = post.findFirst("div[class^='post-view pcol2 _param(1)']");
|
||||
if (body.toPlainText().isEmpty()==false)
|
||||
{
|
||||
str[E_DATA_DATA] = body.toPlainText();
|
||||
@@ -612,11 +647,11 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame)
|
||||
|
||||
|
||||
//retry if profile is empty and sympathy is empty
|
||||
if((strProfile.isEmpty() || sympathy.isEmpty()) && (m_nBlogRetryCount < BLOG_RETRY_MAX))
|
||||
if((strProfile.isEmpty() || sympathy.isEmpty()) && (m_nRetryCount < RETRY_MAX))
|
||||
{
|
||||
m_nBlogRetryCount++;
|
||||
qDebug() << m_nBlogRetryCount;
|
||||
QTimer::singleShot(BLOG_RETRY_INTERVAL, this, SLOT(crawlBlog()));
|
||||
m_nRetryCount++;
|
||||
qDebug() << m_nRetryCount;
|
||||
QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage()));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -661,7 +696,7 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame)
|
||||
return b_ok;
|
||||
}
|
||||
|
||||
void SCrawler::crawlBlog()
|
||||
void SCrawler::reloadPage()
|
||||
{
|
||||
saveResult(true);
|
||||
}
|
||||
@@ -1076,7 +1111,6 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
|
||||
{
|
||||
if (eleSubUrl.attribute("class") == "f_url")
|
||||
strUrl = eleSubUrl.attribute("href");
|
||||
|
||||
}
|
||||
|
||||
if (strUrl.split("/").at(2) == "cafe.daum.net")
|
||||
@@ -1480,6 +1514,320 @@ void SCrawler::saveFrameNewsList(QWebFrame *frame)
|
||||
m_bUse = true;
|
||||
}
|
||||
|
||||
|
||||
// Extracts one Naver news article from the loaded frame and stages it in bodydata.
//
// Steps:
//   1. delete any row already stored for m_strUrl in m_strTable,
//   2. read title / date / body / like text / press logo from the DOM, falling
//      back to the second set of selectors (marked "entertainment" in the
//      original code) when the first set yields nothing,
//   3. copy the values into bodydata and set m_bUse.
//
// Does nothing when m_bUse is already set. The caller is the one that sends
// bodydata to the database.
void SCrawler::saveFrameNewsUrl(QWebFrame *frame)
{
    if (m_bUse) return;

    // Remove a previously stored copy of this article. The URL is bound as a
    // parameter instead of being pasted into the SQL text, so quotes inside the
    // URL can neither break nor alter the statement. The table name cannot be
    // bound and is still concatenated.
    {
        QSqlQuery query;
        query.prepare(QString("delete from " + m_strTable + " where article_url = :URL").toUtf8());
        query.bindValue(":URL", m_strUrl.toUtf8());
        if (query.exec() == false)
        {
            cout << query.lastError().text().toStdString();
            cout << query.lastQuery().toStdString();
        }
    }

    const QWebElement root = frame->documentElement();

    // Primary selectors.
    QWebElement info = Find(root,"div","class","article_info");
    QString strTitle = Find(info,"h3","id","articleTitle").toPlainText();
    QString strDate = Find(info,"span","class","t11").toPlainText();
    QString strData = Find(root,"div","id","articleBodyContents").toPlainText();
    const QString strlike = Find(root,"div","class","u_likeit_module").toPlainText();

    // Fallback selectors (entertainment pages).
    if (strTitle.isEmpty())
    {
        QWebElement titleArea = Find(root,"div","class","end_ct_area");
        strTitle = Find(titleArea,"p","class","end_tit").toPlainText();
    }
    if (strDate.isEmpty())
        strDate = Find(info,"em").toPlainText();
    if (strData.isEmpty())
        strData = Find(root,"div","id","articeBody").toPlainText();   // id spelling is intentional: it is the selector the page is queried with

    // Append the like count to the body when the like module produced any text.
    if (strlike.isEmpty() == false)
        strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")";

    // Press (publisher) information; "http://www.<id>.<tld>..." is reduced to "<id>".
    QWebElement logo = Find(root,"div","class","press_logo");
    QString strPlatID = Find(logo,"a").attribute("href");
    const QString strPlatTitle = Find(logo,"img").attribute("alt");
    const QStringList strlistPlat = strPlatID.split(".");
    if (strlistPlat.size() > 2 && strlistPlat.at(0) == QString("http://www"))
        strPlatID = strlistPlat.at(1);

    bodydata.setTable(m_strTable);
    bodydata.setData(bodydata.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE);
    bodydata.setData(bodydata.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA);
    bodydata.setData(strPlatID,SCrawlerData::PLATFORM_ID);
    bodydata.setData(strPlatTitle,SCrawlerData::PLATFORM_TITLE);
    bodydata.setData(strDate, SCrawlerData::ARTICLE_DATE);
    bodydata.setData("naver", SCrawlerData::PLATFORM_NAME);
    bodydata.setData("news", SCrawlerData::PLATFORM_FORM);
    bodydata.setData("body", SCrawlerData::ARTICLE_FORM);
    bodydata.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
    bodydata.setData(m_strKeywordID, SCrawlerData::KEYWORD_ID);

    m_bUse = true;
}
|
||||
|
||||
bool SCrawler::saveFrameNewsComment(QWebFrame *frame)
|
||||
{
|
||||
if (m_bUse) return true;
|
||||
static bool bReplyDone = false;
|
||||
static bool bReplyReplyDone = false;
|
||||
static int reply_index = 0;
|
||||
|
||||
qDebug() << "executed";
|
||||
|
||||
QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button");
|
||||
|
||||
if(!a.isNull())
|
||||
{
|
||||
|
||||
while(!bReplyDone)
|
||||
{
|
||||
QWebElement current = Find(a, "em", "class", "u_cbox_page_on __cbox_page_current");
|
||||
QWebElement total = Find(a, "em", "class", "u_cbox_page_total __cbox_page_total");
|
||||
QString str_current = current.toPlainText();
|
||||
QString str_total = total.toPlainText();
|
||||
bool ok;
|
||||
|
||||
int n_current = str_current.replace(",", "").toInt(&ok);
|
||||
if(!ok)
|
||||
break;
|
||||
|
||||
int n_total = str_total.replace(",", "").toInt(&ok);
|
||||
if(!ok)
|
||||
break;
|
||||
|
||||
if(n_current >= n_total)
|
||||
{
|
||||
bReplyDone = true;
|
||||
break;
|
||||
}
|
||||
a.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||||
QTimer::singleShot(100, this, SLOT(reloadPage()));
|
||||
qDebug() << "load comments";
|
||||
return false;
|
||||
}
|
||||
|
||||
QWebElementCollection reply_btns = frame->findAllElements("a[class='u_cbox_btn_reply']");
|
||||
for(;reply_index < reply_btns.count() ; reply_index++)
|
||||
{
|
||||
QWebElement btn = Find(reply_btns[reply_index], "span", "class", "u_cbox_reply_cnt");
|
||||
if(btn.isNull())
|
||||
continue;
|
||||
else
|
||||
{
|
||||
btn.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||||
QTimer::singleShot(100, this, SLOT(reloadPage()));
|
||||
//reply_index += 1;
|
||||
qDebug() << reply_index;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
/*
|
||||
foreach(QWebElement a, reply_btns)
|
||||
{
|
||||
QWebElement btn = Find(a, "span", "class", "u_cbox_reply_cnt");
|
||||
if(btn.isNull())
|
||||
continue;
|
||||
else
|
||||
{
|
||||
btn.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||||
//QTimer::singleShot(100, this, SLOT(reloadPage()));
|
||||
qDebug() << "qq";
|
||||
//return false;
|
||||
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
else
|
||||
{
|
||||
if(m_nRetryCount < RETRY_MAX)
|
||||
{
|
||||
m_nRetryCount++;
|
||||
qDebug() << m_nRetryCount;
|
||||
QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage()));
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
m_bUse = true;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
QWebElement logo = Find(frame->documentElement(),"div","class","press_logo");
|
||||
QString strPlatID, strPlatTitle;
|
||||
{
|
||||
strPlatID = Find(logo,"a").attribute("href");
|
||||
strPlatTitle = Find(logo,"img").attribute("alt");
|
||||
}
|
||||
QStringList strlistPlat = strPlatID.split(".");
|
||||
if(strlistPlat.size() > 2)
|
||||
{
|
||||
if (strlistPlat.at(0) == QString("http://www"))
|
||||
strPlatID = strlistPlat.at(1);
|
||||
}
|
||||
//QWebElement ul = frame->findFirstElement("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']");
|
||||
QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li");
|
||||
int order = 0;
|
||||
foreach(QWebElement li, lis)
|
||||
{
|
||||
qDebug() << "li";
|
||||
QWebElement comment_box = li.findFirst("div[class='u_cbox_comment_box']");
|
||||
QString strParent;
|
||||
{
|
||||
QString strID, strNick, strData, strLike, strDislike, strDate;
|
||||
strData = Find(comment_box, "span", "class", "u_cbox_contents").toPlainText();
|
||||
strNick = strParent = strID = Find(comment_box, "span", "class", "u_cbox_name").toPlainText();
|
||||
strLike = Find(comment_box, "em", "class", "u_cbox_cnt_recomm").toPlainText().replace(",", "");
|
||||
strDislike = Find(comment_box, "em", "class", "u_cbox_cnt_unrecomm").toPlainText().replace(",", "");
|
||||
strData += "\n(goodCount:" + strLike +")\n(badCount:" + strDislike + ")";
|
||||
|
||||
strDate = Find(comment_box, "span", "class", "u_cbox_date").toPlainText();
|
||||
if(strDate.contains(":"))
|
||||
strDate += ":00";
|
||||
else
|
||||
{
|
||||
QDateTime current_time = QDateTime::currentDateTime();
|
||||
QRegExp rx("(\\d+)");
|
||||
int pos = 0;
|
||||
QString strTime;
|
||||
while ((pos = rx.indexIn(strDate, pos)) != -1)
|
||||
{
|
||||
strTime = rx.cap(1);
|
||||
pos += rx.matchedLength();
|
||||
}
|
||||
|
||||
if(strDate.contains("시간"))
|
||||
{
|
||||
current_time = current_time.addSecs(-(60 * 60 * strTime.toInt()));
|
||||
}
|
||||
else if(strDate.contains("일"))
|
||||
{
|
||||
current_time = current_time.addDays(-(strTime.toInt()));
|
||||
}
|
||||
else if(strDate.contains("분"))
|
||||
{
|
||||
current_time = current_time.addDays(-(60 * strTime.toInt()));
|
||||
}
|
||||
else
|
||||
{
|
||||
;
|
||||
}
|
||||
strDate = current_time.toString("yyyy-MM-dd hh:mm:ss");
|
||||
qDebug() << strDate;
|
||||
}
|
||||
{
|
||||
QSqlQuery query;
|
||||
query.prepare(QString("insert into " + m_strTable +
|
||||
" (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_order,platform_id,platform_title, article_date) "
|
||||
"VALUES ('naver','news','reply',:URL,:ID,:NICK,:DATA,:ROWNUM,:PLATFORMID,:TITLE,:DATE)").toUtf8());
|
||||
|
||||
query.bindValue(":URL",m_strUrl.replace("&m_view=1","").toUtf8());
|
||||
query.bindValue(":ID",strID.toUtf8());
|
||||
query.bindValue(":NICK",strNick.toUtf8());
|
||||
query.bindValue(":DATA",strData.toUtf8());
|
||||
query.bindValue(":ROWNUM",order++);
|
||||
query.bindValue(":PLATFORMID",strPlatID.toUtf8());
|
||||
query.bindValue(":TITLE",strPlatTitle.toUtf8());
|
||||
query.bindValue(":DATE", strDate.toUtf8());
|
||||
if (query.exec()==false)
|
||||
cout << "error : " << query.lastError().text().toStdString();
|
||||
}
|
||||
}
|
||||
QWebElement reply_area = li.findFirst("div[class='u_cbox_reply_area']");
|
||||
QWebElementCollection sub_lis = reply_area.findAll("ul[class='u_cbox_list']>li");
|
||||
|
||||
foreach(QWebElement sub_li, sub_lis)
|
||||
{
|
||||
QString strID, strNick, strData, strDate;
|
||||
strData = Find(sub_li, "span", "class", "u_cbox_contents").toPlainText();
|
||||
strNick = strID = Find(sub_li, "span", "class", "u_cbox_name").toPlainText();
|
||||
strDate = Find(comment_box, "span", "class", "u_cbox_date").toPlainText();
|
||||
if(strDate.contains(":"))
|
||||
strDate += ":00";
|
||||
else
|
||||
{
|
||||
QDateTime current_time = QDateTime::currentDateTime();
|
||||
QRegExp rx("(\\d+)");
|
||||
int pos = 0;
|
||||
QString strTime;
|
||||
while ((pos = rx.indexIn(strDate, pos)) != -1)
|
||||
{
|
||||
strTime = rx.cap(1);
|
||||
pos += rx.matchedLength();
|
||||
}
|
||||
|
||||
if(strDate.contains("시간"))
|
||||
{
|
||||
current_time = current_time.addSecs(-(60 * 60 * strTime.toInt()));
|
||||
}
|
||||
else if(strDate.contains("일"))
|
||||
{
|
||||
current_time = current_time.addDays(-(strTime.toInt()));
|
||||
}
|
||||
else if(strDate.contains("분"))
|
||||
{
|
||||
current_time = current_time.addDays(-(60 * strTime.toInt()));
|
||||
}
|
||||
else
|
||||
{
|
||||
;
|
||||
}
|
||||
strDate = current_time.toString("yyyy-MM-dd hh:mm:ss");
|
||||
}
|
||||
|
||||
{
|
||||
QSqlQuery query;
|
||||
query.prepare(QString("insert into " + m_strTable +
|
||||
" (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_order,platform_id,platform_title, article_date, article_parent) "
|
||||
"VALUES ('naver','news','reply',:URL,:ID,:NICK,:DATA,:ROWNUM,:PLATFORMID,:TITLE,:DATE,:PARENT)").toUtf8());
|
||||
|
||||
query.bindValue(":URL",m_strUrl.replace("&m_view=1","").toUtf8());
|
||||
query.bindValue(":ID",strID.toUtf8());
|
||||
query.bindValue(":NICK",strNick.toUtf8());
|
||||
query.bindValue(":DATA",strData.toUtf8());
|
||||
query.bindValue(":ROWNUM",order++);
|
||||
query.bindValue(":PLATFORMID",strPlatID.toUtf8());
|
||||
query.bindValue(":TITLE",strPlatTitle.toUtf8());
|
||||
query.bindValue(":DATE", strDate.toUtf8());
|
||||
query.bindValue(":PARENT", strParent.toUtf8());
|
||||
if (query.exec()==false)
|
||||
cout << "error : " << query.lastError().text().toStdString();
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
qDebug() << "lis count: " << lis.count();
|
||||
}
|
||||
|
||||
//Debug("c:\\data\\replytest.html", frame->toHtml());
|
||||
m_bUse = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
|
||||
{
|
||||
QWebElementCollection elements = _FindElement.findAll(_strElement);
|
||||
|
||||
Reference in New Issue
Block a user