git-svn-id: svn://192.168.0.12/source@164 8346c931-da38-4b9b-9d4c-e48b93cbd075

This commit is contained in:
admin
2015-07-08 08:15:02 +00:00
parent da5a2d3843
commit 86f013b167
2 changed files with 0 additions and 189 deletions

View File

@@ -1,156 +0,0 @@
#include "snewscrawler.h"
#include <iostream>
#include <QNetworkRequest>
#include <QWebFrame>
#include <QWebElement>
#include <QWebElementCollection>
using namespace std;
#include <QFile>
#include <QTextStream>
/**
 * Appends text to a file, for debugging output.
 *
 * @param _strFilename  Path of the file to append to.
 * @param _strData      Text written at the end of that file.
 *
 * If the file cannot be opened, nothing is written and the function returns silently.
 */
void SNewsCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile logFile(_strFilename);
    const QIODevice::OpenMode mode = QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append;
    if (logFile.open(mode))
    {
        QTextStream stream(&logFile);
        stream << _strData;
        logFile.close();
    }
}
/**
 * Constructs the crawler and the QWebPage it uses for loading.
 *
 * @param parent  Optional QObject parent, forwarded to QObject.
 *
 * The page's loadFinished(bool) signal is wired to saveResult(bool).
 */
SNewsCrawler::SNewsCrawler(QObject *parent) : QObject(parent) , m_bUse(false)
{
    // Parent the page to this object so Qt's object tree deletes it together
    // with the crawler; previously the page was never released.
    m_page = new QWebPage(this);
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}
// Destructor. The body is empty: no explicit cleanup is performed here.
SNewsCrawler::~SNewsCrawler()
{
}
/**
 * Starts loading an article page and records the request parameters in m_data.
 *
 * @param _strlistArgv  Argument list; three entries are read:
 *                      [0] article URL,
 *                      [1] suffix appended to "data_" to form the table name,
 *                      [2] keyword id.
 *
 * NOTE(review): the list is indexed with at(0..2) without a size check, so
 * callers must pass at least three entries.
 *
 * The result is delivered asynchronously through the page's loadFinished
 * signal, which is connected to saveResult().
 */
void SNewsCrawler::load(QStringList _strlistArgv)
{
    m_strUrl = _strlistArgv.at(0);
    cout << m_strUrl.toStdString() << endl;

    QUrl url = QUrl(m_strUrl);
    if (url.scheme().isEmpty())
        url.setScheme("http");

    // Images are not needed for text extraction.
    m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);

    // Stack-allocated: QWebFrame::load() takes the request by const reference,
    // so the former heap allocation was never freed.
    QNetworkRequest request;
    request.setUrl(url);

    m_data.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
    m_data.setTable("data_"+_strlistArgv.at(1));
    m_data.setData(_strlistArgv.at(2), SCrawlerData::KEYWORD_ID);
    /*
    request->setRawHeader("Cache-Control","max-age=0, no-cache");
    request->setRawHeader("Pragma","no-cache");
    request->setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
    if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
    request->setRawHeader("Referer",m_strReper.toLocal8Bit());
    */
    request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
    m_page->mainFrame()->load(request);
}
/**
 * Searches below an element for the first match with a given attribute value.
 *
 * @param _FindElement  Element whose descendants are searched.
 * @param _strElement   Selector passed to QWebElement::findAll().
 * @param _strAttrib    Attribute name to compare (defaults to "").
 * @param _strFind      Required attribute value (defaults to "").
 * @return The first candidate whose attribute equals _strFind, or a
 *         default-constructed QWebElement when none matches.
 */
QWebElement SNewsCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib="",const QString _strFind="")
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();
    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        if (candidate.attribute(_strAttrib) == _strFind)
            return candidate;
    }
    // Nothing matched: hand back a default-constructed element.
    return QWebElement();
}
void SNewsCrawler::saveResult(bool ok)
{
if (m_bUse) return;
if (!ok)
cout << "Failed loading";
else
{
QString strTitle,strDate,strData,strPlatID,strPlatTitle,strlike;
{
QWebElement element = Find(m_page->mainFrame()->documentElement(),"div","class","article_info");
{
strTitle = Find(element,"h3","id","articleTitle").toPlainText(); // Title;
strDate = Find(element,"span","class","t11").toPlainText(); // Date
}
strData = Find(m_page->mainFrame()->documentElement(),"div","id","articleBodyContents").toPlainText();
strlike = Find(m_page->mainFrame()->documentElement(),"div","class","u_likeit_module").toPlainText();
//entertainment
if (strTitle.isEmpty())
{
QWebElement elementTitle = Find(m_page->mainFrame()->documentElement(),"div","class","end_ct_area");
strTitle = Find(elementTitle,"p","class","end_tit").toPlainText();
}
//entertainment
if (strDate.isEmpty()) strDate = Find(element,"em").toPlainText();
if (strData.isEmpty()) strData = Find(m_page->mainFrame()->documentElement(),"div","id","articeBody").toPlainText();
if (strlike.isEmpty() == false) strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")";
else
{
Debug("out.html",m_page->mainFrame()->toHtml());
}
element = Find(m_page->mainFrame()->documentElement(),"div","class","press_logo");
{
strPlatID = Find(element,"a").attribute("href");
strPlatTitle = Find(element,"img").attribute("alt");
QStringList strlistPlat = strPlatID.split(".");
if(strlistPlat.size() > 2)
{
if (strlistPlat.at(0) == QString("http://www"))
strPlatID = strlistPlat.at(1);
}
}
}
//platform_title,platform_id
m_data.deleteDB(m_strUrl,SCrawlerData::ARTICLE_URL);
m_data.setData(m_data.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE);
m_data.setData(m_data.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA);
m_data.setData(strPlatID,SCrawlerData::PLATFORM_ID);
m_data.setData(strPlatTitle,SCrawlerData::PLATFORM_TITLE);
m_data.setData(strDate, SCrawlerData::ARTICLE_DATE);
m_data.setData("naver", SCrawlerData::PLATFORM_NAME);
m_data.setData("news", SCrawlerData::PLATFORM_FORM);
m_data.setData("body", SCrawlerData::ARTICLE_FORM);
m_data.sendDB();
saveFrame(m_page->mainFrame());
m_strUrl.split("&");
m_reply.SetUrl(m_strUrl);
m_reply.Start(&m_data);
if (m_bUse)
cout << "ok";
else
cout << "fail";
emit finished();
}
}
/**
 * Walks a frame tree looking for the frame named "ifrMemo".
 *
 * @param frame  Frame to inspect; its child frames are visited recursively.
 *
 * When the frame is found, the text of its "strong" element with class
 * "_totalcount" is trimmed, stripped of commas, converted to an integer and
 * passed to m_reply.SetTotal(); m_bUse is then set, which makes every later
 * call return immediately.
 */
void SNewsCrawler::saveFrame(QWebFrame *frame)
{
    if (m_bUse)
        return;

    if (frame->frameName() == "ifrMemo")
    {
        const QWebElement totalElement = Find(frame->documentElement(),"strong","class","_totalcount");
        QString strTotal = totalElement.toPlainText().trimmed();
        strTotal.replace(",","");
        m_reply.SetTotal(strTotal.toInt());
        m_bUse = true;
    }

    const QList<QWebFrame *> children = frame->childFrames();
    for (int idx = 0; idx < children.size(); ++idx)
        saveFrame(children.at(idx));
}

View File

@@ -1,33 +0,0 @@
#ifndef SNEWSCRAWLER_H
#define SNEWSCRAWLER_H
#include <QWebPage>
#include <QObject>
#include <QThreadPool>
#include "sreplygetmanage.h"
// Loads a single article page in a QWebPage, extracts its fields from the DOM
// and stores them through SCrawlerData; emits finished() after a successful
// load has been processed.
class SNewsCrawler : public QObject
{
Q_OBJECT
public:
explicit SNewsCrawler(QObject *parent = 0);
~SNewsCrawler();
// Starts loading; reads entries 0..2 of the list (URL, table suffix, keyword id).
void load(QStringList _strlistArgv);
// Appends _strData to the file named _strFilename.
void Debug(QString _strFilename,QString _strData);
signals:
// Emitted at the end of saveResult() when the load succeeded.
void finished();
private slots:
// Connected to the page's loadFinished(bool) signal in the constructor.
void saveResult(bool ok);
private:
QWebPage *m_page;        // page used for loading; created in the constructor
QString m_strUrl;        // URL taken from the first load() argument
SReplyGetManage m_reply; // given the URL and started at the end of saveResult()
SCrawlerData m_data;     // collects the extracted fields; sendDB() is called in saveResult()
bool m_bUse;             // set by saveFrame() once the "ifrMemo" frame is found; guards re-entry
private:
// Returns the first element under _FindElement that matches _strElement and whose
// attribute _strAttrib equals _strFind, or a default-constructed element if none does.
// The definition supplies empty-string defaults for the last two parameters.
QWebElement Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind);
// Recursively searches the frame tree for the frame named "ifrMemo".
void saveFrame(QWebFrame *frame);
};
#endif // SNEWSCRAWLER_H