157 lines
5.5 KiB
C++
157 lines
5.5 KiB
C++
#include "snewscrawler.h"
|
||
|
||
#include <iostream>
|
||
#include <QNetworkRequest>
|
||
#include <QWebFrame>
|
||
#include <QWebElement>
|
||
#include <QWebElementCollection>
|
||
|
||
using namespace std;
|
||
|
||
#include <QFile>
|
||
#include <QTextStream>
|
||
|
||
/**
 * Appends a piece of text to a file on disk.
 *
 * The file is opened in write + text + append mode, so existing content is
 * kept and the new text is added at the end.
 *
 * @param _strFilename Path of the file to append to.
 * @param _strData     Text to write.
 *
 * If the file cannot be opened the call returns without writing anything
 * and without reporting the failure.
 */
void SNewsCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile dumpFile(_strFilename);
    const bool bOpened = dumpFile.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append);
    if (bOpened)
    {
        QTextStream writer(&dumpFile);
        writer << _strData;
        dumpFile.close();
    }
}
|
||
|
||
/**
 * Constructs the crawler.
 *
 * Creates the QWebPage used to load articles and connects its
 * loadFinished(bool) signal to saveResult(bool). m_bUse starts out false.
 *
 * @param parent Optional QObject parent, forwarded to QObject.
 */
SNewsCrawler::SNewsCrawler(QObject *parent) : QObject(parent) , m_bUse(false)
{
    // The page is parented to this object so the QObject tree deletes it
    // together with the crawler; a parentless page would never be freed.
    m_page = new QWebPage(this);
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}
|
||
|
||
/**
 * Destructor. Performs no explicit cleanup of its own.
 */
SNewsCrawler::~SNewsCrawler()
{
    // Nothing to do here.
}
|
||
|
||
/**
 * Starts loading an article page and records the request metadata in m_data.
 *
 * @param _strlistArgv Argument list. Three entries are read:
 *                     [0] article URL (also stored in m_strUrl),
 *                     [1] suffix appended to "data_" to form the table name,
 *                     [2] value stored as SCrawlerData::KEYWORD_ID.
 *
 * The URL is echoed to stdout. If it has no scheme, "http" is used for the
 * request (m_strUrl itself keeps the text as given). Image auto-loading is
 * switched off on the page before the request is issued.
 *
 * If fewer than three entries are supplied, an error is written to stderr
 * and nothing is loaded; no signal is emitted in that case.
 */
void SNewsCrawler::load(QStringList _strlistArgv)
{
    // at() must not be called with an out-of-range index, so reject short
    // argument lists before touching any element.
    if (_strlistArgv.size() < 3)
    {
        cerr << "SNewsCrawler::load: expected 3 arguments (url, table suffix, keyword id), got "
             << _strlistArgv.size() << endl;
        return;
    }

    m_strUrl = _strlistArgv.at(0);
    cout << m_strUrl.toStdString() << endl;

    QUrl url = QUrl(m_strUrl);
    if (url.scheme().isEmpty())
        url.setScheme("http");

    m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);

    m_data.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
    m_data.setTable("data_"+_strlistArgv.at(1));
    m_data.setData(_strlistArgv.at(2), SCrawlerData::KEYWORD_ID);

    // The request lives on the stack: QWebFrame::load() takes it by const
    // reference, so a heap allocation here would only leak.
    QNetworkRequest request;
    request.setUrl(url);
    /*
    Disabled request headers, kept for reference:
    request.setRawHeader("Cache-Control","max-age=0, no-cache");
    request.setRawHeader("Pragma","no-cache");
    request.setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
    if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
        request.setRawHeader("Referer",m_strReper.toLocal8Bit());
    */
    request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
    m_page->mainFrame()->load(request);
}
|
||
|
||
/**
 * Searches below an element for the first match with a given attribute value.
 *
 * @param _FindElement Element whose matches for _strElement are searched.
 * @param _strElement  Query passed to QWebElement::findAll().
 * @param _strAttrib   Name of the attribute to compare (defaults to "").
 * @param _strFind     Value the attribute must equal (defaults to "").
 *
 * @return The first element, in findAll() order, whose attribute
 *         _strAttrib equals _strFind; a default-constructed QWebElement
 *         when no candidate matches.
 */
QWebElement SNewsCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib="",const QString _strFind="")
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int nCandidates = candidates.count();
    for (int idx = 0; idx < nCandidates; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        if (candidate.attribute(_strAttrib) != _strFind)
            continue;
        return candidate;
    }
    // No candidate carried the requested attribute value.
    return QWebElement();
}
|
||
|
||
void SNewsCrawler::saveResult(bool ok)
|
||
{
|
||
if (m_bUse) return;
|
||
if (!ok)
|
||
cout << "Failed loading";
|
||
else
|
||
{
|
||
QString strTitle,strDate,strData,strPlatID,strPlatTitle,strlike;
|
||
{
|
||
QWebElement element = Find(m_page->mainFrame()->documentElement(),"div","class","article_info");
|
||
{
|
||
strTitle = Find(element,"h3","id","articleTitle").toPlainText(); // Title;
|
||
strDate = Find(element,"span","class","t11").toPlainText(); // Date
|
||
}
|
||
strData = Find(m_page->mainFrame()->documentElement(),"div","id","articleBodyContents").toPlainText();
|
||
strlike = Find(m_page->mainFrame()->documentElement(),"div","class","u_likeit_module").toPlainText();
|
||
//entertainment
|
||
if (strTitle.isEmpty())
|
||
{
|
||
QWebElement elementTitle = Find(m_page->mainFrame()->documentElement(),"div","class","end_ct_area");
|
||
strTitle = Find(elementTitle,"p","class","end_tit").toPlainText();
|
||
}
|
||
//entertainment
|
||
if (strDate.isEmpty()) strDate = Find(element,"em").toPlainText();
|
||
if (strData.isEmpty()) strData = Find(m_page->mainFrame()->documentElement(),"div","id","articeBody").toPlainText();
|
||
|
||
if (strlike.isEmpty() == false) strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")";
|
||
else
|
||
{
|
||
Debug("out.html",m_page->mainFrame()->toHtml());
|
||
}
|
||
|
||
element = Find(m_page->mainFrame()->documentElement(),"div","class","press_logo");
|
||
{
|
||
strPlatID = Find(element,"a").attribute("href");
|
||
strPlatTitle = Find(element,"img").attribute("alt");
|
||
QStringList strlistPlat = strPlatID.split(".");
|
||
if(strlistPlat.size() > 2)
|
||
{
|
||
if (strlistPlat.at(0) == QString("http://www"))
|
||
strPlatID = strlistPlat.at(1);
|
||
}
|
||
}
|
||
}
|
||
|
||
//platform_title,platform_id
|
||
|
||
m_data.deleteDB(m_strUrl,SCrawlerData::ARTICLE_URL);
|
||
m_data.setData(m_data.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE);
|
||
m_data.setData(m_data.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA);
|
||
m_data.setData(strPlatID,SCrawlerData::PLATFORM_ID);
|
||
m_data.setData(strPlatTitle,SCrawlerData::PLATFORM_TITLE);
|
||
m_data.setData(strDate, SCrawlerData::ARTICLE_DATE);
|
||
m_data.setData("naver", SCrawlerData::PLATFORM_NAME);
|
||
m_data.setData("news", SCrawlerData::PLATFORM_FORM);
|
||
m_data.setData("body", SCrawlerData::ARTICLE_FORM);
|
||
m_data.sendDB();
|
||
saveFrame(m_page->mainFrame());
|
||
m_strUrl.split("&");
|
||
m_reply.SetUrl(m_strUrl);
|
||
m_reply.Start(&m_data);
|
||
if (m_bUse)
|
||
cout << "ok";
|
||
else
|
||
cout << "fail";
|
||
emit finished();
|
||
}
|
||
}
|
||
|
||
/**
 * Recursively walks a frame and its child frames looking for the frame
 * named "ifrMemo".
 *
 * When that frame is found, the text of its <strong class="_totalcount">
 * element is trimmed, stripped of commas, converted to an integer and
 * passed to m_reply.SetTotal(); m_bUse is then set to true. Every call made
 * while m_bUse is true returns immediately.
 *
 * @param frame Frame to inspect; its children are visited afterwards.
 */
void SNewsCrawler::saveFrame(QWebFrame *frame)
{
    if (m_bUse) return;

    const bool bIsMemoFrame = (frame->frameName() == "ifrMemo");
    if (bIsMemoFrame)
    {
        QString strTotal = Find(frame->documentElement(),"strong","class","_totalcount").toPlainText().trimmed();
        strTotal.replace(",","");
        m_reply.SetTotal(strTotal.toInt());
        m_bUse = true;
    }

    const QList<QWebFrame*> children = frame->childFrames();
    for (int i = 0; i < children.size(); ++i)
        saveFrame(children.at(i));
}
|
||
|