git-svn-id: svn://192.168.0.12/source@164 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -1,156 +0,0 @@
|
||||
#include "snewscrawler.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <QNetworkRequest>
|
||||
#include <QWebFrame>
|
||||
#include <QWebElement>
|
||||
#include <QWebElementCollection>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#include <QFile>
|
||||
#include <QTextStream>
|
||||
|
||||
/// Appends _strData to the file named _strFilename, opened in text mode.
///
/// The file is opened with WriteOnly | Text | Append, so existing content is
/// kept and the new text goes to the end. If the file cannot be opened the
/// call does nothing and reports no error.
void SNewsCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile dumpFile(_strFilename);
    const QIODevice::OpenMode openFlags = QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append;
    if (dumpFile.open(openFlags))
    {
        QTextStream writer(&dumpFile);
        writer << _strData;
        dumpFile.close();
    }
}
|
||||
|
||||
/// Creates the crawler and the web page it uses to load articles.
///
/// @param parent  Optional QObject parent, forwarded to QObject.
///
/// m_bUse starts out false. The page's loadFinished(bool) signal is wired to
/// saveResult(bool), so parsing starts as soon as a page load completes.
SNewsCrawler::SNewsCrawler(QObject *parent) : QObject(parent) , m_bUse(false)
{
    // Parent the page to this object: the class never deletes m_page itself,
    // so without a parent the QWebPage would be leaked when the crawler dies.
    m_page = new QWebPage(this);
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}
|
||||
|
||||
/// Destructor. Performs no explicit cleanup of its own.
SNewsCrawler::~SNewsCrawler()
{
}
|
||||
|
||||
/// Starts loading an article page and records the request metadata.
///
/// @param _strlistArgv  Positional arguments:
///                      [0] article URL (also echoed to stdout),
///                      [1] suffix appended to "data_" to form the table name,
///                      [2] value stored as SCrawlerData::KEYWORD_ID.
///
/// The list is indexed with QStringList::at(), which requires valid indexes,
/// so callers must pass at least three entries.
///
/// The load is asynchronous; the outcome is delivered to saveResult(bool)
/// through the page's loadFinished(bool) signal.
void SNewsCrawler::load(QStringList _strlistArgv)
{
    m_strUrl = _strlistArgv.at(0);
    cout << m_strUrl.toStdString() << endl;

    QUrl url = QUrl(m_strUrl);
    if (url.scheme().isEmpty())
        url.setScheme("http");

    // Images are not needed for text extraction.
    m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);

    m_data.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
    m_data.setTable("data_"+_strlistArgv.at(1));
    m_data.setData(_strlistArgv.at(2), SCrawlerData::KEYWORD_ID);

    // The request lives on the stack: QWebFrame::load() takes it by const
    // reference, so a heap allocation here would never be released.
    QNetworkRequest request(url);
    /*
    Disabled header setup, kept for reference (m_strReper and m_nSelect are
    not members of this class):

    request.setRawHeader("Cache-Control","max-age=0, no-cache");
    request.setRawHeader("Pragma","no-cache");
    request.setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
    if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
        request.setRawHeader("Referer",m_strReper.toLocal8Bit());
    */
    request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
    m_page->mainFrame()->load(request);
}
|
||||
|
||||
/// Returns the first element under _FindElement that findAll(_strElement)
/// yields and whose attribute _strAttrib compares equal to _strFind.
///
/// @param _FindElement  Element whose subtree is searched.
/// @param _strElement   Selector passed to QWebElement::findAll().
/// @param _strAttrib    Attribute name to inspect (defaults to "").
/// @param _strFind      Required attribute value (defaults to "").
/// @return The first matching element, or a default-constructed QWebElement
///         when nothing matches.
QWebElement SNewsCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib="",const QString _strFind="")
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();
    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        if (candidate.attribute(_strAttrib) != _strFind)
            continue;
        return candidate;
    }
    return QWebElement();
}
|
||||
|
||||
void SNewsCrawler::saveResult(bool ok)
|
||||
{
|
||||
if (m_bUse) return;
|
||||
if (!ok)
|
||||
cout << "Failed loading";
|
||||
else
|
||||
{
|
||||
QString strTitle,strDate,strData,strPlatID,strPlatTitle,strlike;
|
||||
{
|
||||
QWebElement element = Find(m_page->mainFrame()->documentElement(),"div","class","article_info");
|
||||
{
|
||||
strTitle = Find(element,"h3","id","articleTitle").toPlainText(); // Title;
|
||||
strDate = Find(element,"span","class","t11").toPlainText(); // Date
|
||||
}
|
||||
strData = Find(m_page->mainFrame()->documentElement(),"div","id","articleBodyContents").toPlainText();
|
||||
strlike = Find(m_page->mainFrame()->documentElement(),"div","class","u_likeit_module").toPlainText();
|
||||
//entertainment
|
||||
if (strTitle.isEmpty())
|
||||
{
|
||||
QWebElement elementTitle = Find(m_page->mainFrame()->documentElement(),"div","class","end_ct_area");
|
||||
strTitle = Find(elementTitle,"p","class","end_tit").toPlainText();
|
||||
}
|
||||
//entertainment
|
||||
if (strDate.isEmpty()) strDate = Find(element,"em").toPlainText();
|
||||
if (strData.isEmpty()) strData = Find(m_page->mainFrame()->documentElement(),"div","id","articeBody").toPlainText();
|
||||
|
||||
if (strlike.isEmpty() == false) strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")";
|
||||
else
|
||||
{
|
||||
Debug("out.html",m_page->mainFrame()->toHtml());
|
||||
}
|
||||
|
||||
element = Find(m_page->mainFrame()->documentElement(),"div","class","press_logo");
|
||||
{
|
||||
strPlatID = Find(element,"a").attribute("href");
|
||||
strPlatTitle = Find(element,"img").attribute("alt");
|
||||
QStringList strlistPlat = strPlatID.split(".");
|
||||
if(strlistPlat.size() > 2)
|
||||
{
|
||||
if (strlistPlat.at(0) == QString("http://www"))
|
||||
strPlatID = strlistPlat.at(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//platform_title,platform_id
|
||||
|
||||
m_data.deleteDB(m_strUrl,SCrawlerData::ARTICLE_URL);
|
||||
m_data.setData(m_data.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE);
|
||||
m_data.setData(m_data.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA);
|
||||
m_data.setData(strPlatID,SCrawlerData::PLATFORM_ID);
|
||||
m_data.setData(strPlatTitle,SCrawlerData::PLATFORM_TITLE);
|
||||
m_data.setData(strDate, SCrawlerData::ARTICLE_DATE);
|
||||
m_data.setData("naver", SCrawlerData::PLATFORM_NAME);
|
||||
m_data.setData("news", SCrawlerData::PLATFORM_FORM);
|
||||
m_data.setData("body", SCrawlerData::ARTICLE_FORM);
|
||||
m_data.sendDB();
|
||||
saveFrame(m_page->mainFrame());
|
||||
m_strUrl.split("&");
|
||||
m_reply.SetUrl(m_strUrl);
|
||||
m_reply.Start(&m_data);
|
||||
if (m_bUse)
|
||||
cout << "ok";
|
||||
else
|
||||
cout << "fail";
|
||||
emit finished();
|
||||
}
|
||||
}
|
||||
|
||||
/// Walks the frame tree looking for the frame named "ifrMemo".
///
/// When that frame is found, the text of its strong._totalcount element is
/// trimmed, stripped of commas, converted to int and passed to
/// m_reply.SetTotal(); m_bUse is then set, which makes every further call
/// return immediately.
///
/// @param frame  Frame to inspect; its child frames are visited recursively.
void SNewsCrawler::saveFrame(QWebFrame *frame)
{
    if (m_bUse)
        return;

    if (frame->frameName() == "ifrMemo")
    {
        const QWebElement counter = Find(frame->documentElement(),"strong","class","_totalcount");
        QString countText = counter.toPlainText().trimmed();
        countText.replace(",","");
        m_reply.SetTotal(countText.toInt());
        m_bUse = true;
    }

    const QList<QWebFrame*> children = frame->childFrames();
    for (int idx = 0; idx < children.size(); ++idx)
        saveFrame(children.at(idx));
}
|
||||
|
||||
@@ -1,33 +0,0 @@
|
||||
#ifndef SNEWSCRAWLER_H
|
||||
#define SNEWSCRAWLER_H
|
||||
|
||||
#include <QWebPage>
|
||||
#include <QObject>
|
||||
#include <QThreadPool>
|
||||
|
||||
#include "sreplygetmanage.h"
|
||||
|
||||
class SNewsCrawler : public QObject
|
||||
{
|
||||
Q_OBJECT
|
||||
public:
|
||||
explicit SNewsCrawler(QObject *parent = 0);
|
||||
~SNewsCrawler();
|
||||
void load(QStringList _strlistArgv);
|
||||
void Debug(QString _strFilename,QString _strData);
|
||||
signals:
|
||||
void finished();
|
||||
private slots:
|
||||
void saveResult(bool ok);
|
||||
private:
|
||||
QWebPage *m_page;
|
||||
QString m_strUrl;
|
||||
SReplyGetManage m_reply;
|
||||
SCrawlerData m_data;
|
||||
bool m_bUse;
|
||||
private:
|
||||
QWebElement Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind);
|
||||
void saveFrame(QWebFrame *frame);
|
||||
};
|
||||
|
||||
#endif // SNEWSCRAWLER_H
|
||||
Reference in New Issue
Block a user