/*
 * Patch summary (commentary only; everything from the first "diff --git"
 * header onward is unchanged).
 *
 * This is a deletion-only git diff. Both files are marked
 * "deleted file mode 100644" and target /dev/null:
 *   - AjaxCrawlerProcess/snewscrawler.cpp  (hunk header: 156 lines removed)
 *   - AjaxCrawlerProcess/snewscrawler.h    (hunk header: 33 lines removed)
 *
 * What the removed code defines: class SNewsCrawler, a QObject subclass.
 *   Debug(_strFilename, _strData)
 *       Opens the named file with WriteOnly | Text | Append and writes
 *       _strData through a QTextStream; returns silently if open() fails.
 *   SNewsCrawler(parent)
 *       Allocates a QWebPage into m_page, initialises m_bUse to false and
 *       connects the page's loadFinished(bool) to saveResult(bool).
 *   load(_strlistArgv)
 *       Reads index 0 as the URL (scheme set to "http" when empty), index 1
 *       as the suffix of the table name "data_" + ..., and index 2 as the
 *       value stored under SCrawlerData::KEYWORD_ID. Disables AutoLoadImages,
 *       sets an Accept-Language raw header and loads the main frame.
 *   Find(_FindElement, _strElement, _strAttrib = "", _strFind = "")
 *       Returns the first element from findAll(_strElement) whose
 *       attribute(_strAttrib) equals _strFind, otherwise a
 *       default-constructed QWebElement.
 *   saveResult(ok)
 *       Returns immediately when m_bUse is already set. When ok is false it
 *       only prints "Failed loading"; finished() is emitted solely on the
 *       success branch. On success it reads title, date, body, like text and
 *       press logo link/alt from the document, stores them in m_data, calls
 *       deleteDB()/sendDB(), walks the frames with saveFrame(), hands the URL
 *       and m_data to m_reply, prints "ok" or "fail" depending on m_bUse,
 *       then emits finished(). The result of m_strUrl.split("&") is discarded.
 *   saveFrame(frame)
 *       Recursively visits child frames; for a frame named "ifrMemo" it
 *       parses the text of strong[class="_totalcount"] (trimmed, commas
 *       removed) as an int, passes it to m_reply.SetTotal() and sets m_bUse.
 *
 * NOTE(review): every payload line is a removal, so the removed text has to
 * stay exactly as recorded for the patch to match the deleted blobs; no code
 * changes are proposed here.
 * NOTE(review): in this copy the #include directives carry no header names
 * and the original line breaks are collapsed - presumably lost in
 * extraction; confirm against the original commit before applying.
 * NOTE(review): no replacement for SNewsCrawler is visible in this diff;
 * confirm its callers were removed or migrated in the same change.
 */
diff --git a/AjaxCrawlerProcess/snewscrawler.cpp b/AjaxCrawlerProcess/snewscrawler.cpp deleted file mode 100644 index d22a745..0000000 --- a/AjaxCrawlerProcess/snewscrawler.cpp +++ /dev/null @@ -1,156 +0,0 @@ -#include "snewscrawler.h" - -#include -#include -#include -#include -#include - -using namespace std; - -#include -#include - -void SNewsCrawler::Debug(QString _strFilename,QString _strData) -{ - QFile file(_strFilename); - if (!file.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append)) - return; - QTextStream out(&file); - out << _strData; - file.close(); -} - -SNewsCrawler::SNewsCrawler(QObject *parent) : QObject(parent) , m_bUse(false) -{ - m_page = new QWebPage; - connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool))); -} - -SNewsCrawler::~SNewsCrawler() -{ - -} - -void SNewsCrawler::load(QStringList _strlistArgv) -{ - m_strUrl = _strlistArgv.at(0); - cout << m_strUrl.toStdString() << endl; - QUrl url = QUrl(m_strUrl); - if (url.scheme().isEmpty()) - url.setScheme("http"); - - m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false); - QNetworkRequest *request = new QNetworkRequest; - request->setUrl(url); - - m_data.setData(m_strUrl, SCrawlerData::ARTICLE_URL); - m_data.setTable("data_"+_strlistArgv.at(1)); - m_data.setData(_strlistArgv.at(2), SCrawlerData::KEYWORD_ID); - /* - request->setRawHeader("Cache-Control","max-age=0, no-cache"); - request->setRawHeader("Pragma","no-cache"); - request->setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT"); - if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA) - request->setRawHeader("Referer",m_strReper.toLocal8Bit()); - */ - request->setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2"); - m_page->mainFrame()->load(*request); - -} - -QWebElement SNewsCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib="",const QString _strFind="") -{ - QWebElementCollection elements = 
_FindElement.findAll(_strElement); - foreach (QWebElement element, elements) - { - if (element.attribute(_strAttrib) == _strFind) - { - return element; - } - } - QWebElement element; - return element; -} - -void SNewsCrawler::saveResult(bool ok) -{ - if (m_bUse) return; - if (!ok) - cout << "Failed loading"; - else - { - QString strTitle,strDate,strData,strPlatID,strPlatTitle,strlike; - { - QWebElement element = Find(m_page->mainFrame()->documentElement(),"div","class","article_info"); - { - strTitle = Find(element,"h3","id","articleTitle").toPlainText(); // Title; - strDate = Find(element,"span","class","t11").toPlainText(); // Date - } - strData = Find(m_page->mainFrame()->documentElement(),"div","id","articleBodyContents").toPlainText(); - strlike = Find(m_page->mainFrame()->documentElement(),"div","class","u_likeit_module").toPlainText(); - //e​ntertainment - if (strTitle.isEmpty()) - { - QWebElement elementTitle = Find(m_page->mainFrame()->documentElement(),"div","class","end_ct_area"); - strTitle = Find(elementTitle,"p","class","end_tit").toPlainText(); - } - //e​ntertainment - if (strDate.isEmpty()) strDate = Find(element,"em").toPlainText(); - if (strData.isEmpty()) strData = Find(m_page->mainFrame()->documentElement(),"div","id","articeBody").toPlainText(); - - if (strlike.isEmpty() == false) strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")"; - else - { - Debug("out.html",m_page->mainFrame()->toHtml()); - } - - element = Find(m_page->mainFrame()->documentElement(),"div","class","press_logo"); - { - strPlatID = Find(element,"a").attribute("href"); - strPlatTitle = Find(element,"img").attribute("alt"); - QStringList strlistPlat = strPlatID.split("."); - if(strlistPlat.size() > 2) - { - if (strlistPlat.at(0) == QString("http://www")) - strPlatID = strlistPlat.at(1); - } - } - } - - //platform_title,platform_id - - m_data.deleteDB(m_strUrl,SCrawlerData::ARTICLE_URL); - m_data.setData(m_data.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE); - 
m_data.setData(m_data.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA); - m_data.setData(strPlatID,SCrawlerData::PLATFORM_ID); - m_data.setData(strPlatTitle,SCrawlerData::PLATFORM_TITLE); - m_data.setData(strDate, SCrawlerData::ARTICLE_DATE); - m_data.setData("naver", SCrawlerData::PLATFORM_NAME); - m_data.setData("news", SCrawlerData::PLATFORM_FORM); - m_data.setData("body", SCrawlerData::ARTICLE_FORM); - m_data.sendDB(); - saveFrame(m_page->mainFrame()); - m_strUrl.split("&"); - m_reply.SetUrl(m_strUrl); - m_reply.Start(&m_data); - if (m_bUse) - cout << "ok"; - else - cout << "fail"; - emit finished(); - } -} - -void SNewsCrawler::saveFrame(QWebFrame *frame) -{ - if (m_bUse) return; - if (frame->frameName() == "ifrMemo") - { - m_reply.SetTotal(Find(frame->documentElement(),"strong","class","_totalcount").toPlainText().trimmed().replace(",","").toInt()); - m_bUse = true; - } - foreach(QWebFrame *childFrame, frame->childFrames()) - saveFrame(childFrame); -} - diff --git a/AjaxCrawlerProcess/snewscrawler.h b/AjaxCrawlerProcess/snewscrawler.h deleted file mode 100644 index c7dc4d1..0000000 --- a/AjaxCrawlerProcess/snewscrawler.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef SNEWSCRAWLER_H -#define SNEWSCRAWLER_H - -#include -#include -#include - -#include "sreplygetmanage.h" - -class SNewsCrawler : public QObject -{ - Q_OBJECT -public: - explicit SNewsCrawler(QObject *parent = 0); - ~SNewsCrawler(); - void load(QStringList _strlistArgv); - void Debug(QString _strFilename,QString _strData); -signals: - void finished(); -private slots: - void saveResult(bool ok); -private: - QWebPage *m_page; - QString m_strUrl; - SReplyGetManage m_reply; - SCrawlerData m_data; - bool m_bUse; -private: - QWebElement Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind); - void saveFrame(QWebFrame *frame); -}; - -#endif // SNEWSCRAWLER_H