445 lines
20 KiB
C++
445 lines
20 KiB
C++
#include "scrawler.h"
|
||
|
||
#include <iostream>
|
||
#include <QNetworkRequest>
|
||
#include <QWebFrame>
|
||
#include <QWebElement>
|
||
#include <QWebElementCollection>
|
||
#include <QSqlQuery>
|
||
#include <QSqlError>
|
||
|
||
using namespace std;
|
||
|
||
#include <QFile>
|
||
#include <QTextStream>
|
||
|
||
#include "data.h"
|
||
|
||
/**
 * Debugging aid: appends text to a file.
 *
 * @param _strFilename  Path of the file to open in write/text/append mode.
 * @param _strData      Text streamed into the file.
 *
 * Does nothing when the file cannot be opened.
 */
void SCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile logFile(_strFilename);
    const bool opened = logFile.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append);
    if (opened)
    {
        QTextStream writer(&logFile);
        writer << _strData;
        // The file is closed while the stream object is still alive.
        logFile.close();
    }
}
|
||
|
||
/**
 * Creates the crawler and the web page it uses to load documents.
 *
 * @param parent  Qt parent object, forwarded to QObject.
 *
 * The page is created as a QObject child of this crawler so that it is
 * destroyed together with it (previously it had no parent and nothing in
 * this file deleted it). The page's loadFinished(bool) signal is routed to
 * saveResult(bool).
 */
SCrawler::SCrawler(QObject *parent) : QObject(parent) , m_bUse(false)
{
    m_page = new QWebPage(this);
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}
|
||
|
||
/**
 * Destructor. Performs no explicit cleanup of its own.
 */
SCrawler::~SCrawler()
{
}
|
||
|
||
/**
 * Starts crawling one article.
 *
 * @param _strlistArgv  Positional arguments:
 *                      [0] platform selector ("naver_news" or "daum_cafe"),
 *                      [1] article URL,
 *                      [2] suffix appended to "data_" to form the table name,
 *                      [3] value stored as KEYWORD_ID.
 *
 * Records the article URL (for Daum cafes, the part before '?'), configures
 * the reply manager for the chosen platform, starts loading the page with
 * image loading disabled, and finally asks m_data to delete rows matching
 * the article URL. Parsing continues in saveResult() once the page has
 * finished loading.
 *
 * If fewer than four arguments are supplied, nothing is loaded: an error is
 * written to stderr and finished() is queued so that a listener waiting on
 * the event loop is still notified.
 */
void SCrawler::load(QStringList _strlistArgv)
{
    // Indices 0..3 are read below; QList::at() must not be called out of range.
    if (_strlistArgv.size() < 4)
    {
        std::cerr << "SCrawler::load: expected 4 arguments, got "
                  << _strlistArgv.size() << std::endl;
        // Queued so the signal is delivered from the event loop even when
        // load() is called before the loop has started.
        QMetaObject::invokeMethod(this, "finished", Qt::QueuedConnection);
        return;
    }

    const QString strPlatform = _strlistArgv.at(0);
    if (strPlatform == "naver_news")
    {
        m_strUrl = _strlistArgv.at(1);
        m_data.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
        m_reply.SetSelect(E_NAVER_NEWS);
    }
    else if (strPlatform == "daum_cafe")
    {
        m_strUrl = _strlistArgv.at(1);
        m_reply.SetSelect(E_DAUM_CAFE);
        // Store the URL without its query string when there is one.
        const QStringList strlist = m_strUrl.split("?");
        if (strlist.size() > 1)
            m_data.setData(strlist.at(0).trimmed(), SCrawlerData::ARTICLE_URL);
        else
            m_data.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
    }
    std::cout << m_strUrl.toStdString() << std::endl;

    QUrl url = QUrl(m_strUrl);
    if (url.scheme().isEmpty())
        url.setScheme("http");
    m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);

    // Stack-allocated: the request used to be created with new and was never deleted.
    QNetworkRequest request;
    request.setUrl(url);
    m_data.setTable("data_"+_strlistArgv.at(2));
    m_data.setData(_strlistArgv.at(3), SCrawlerData::KEYWORD_ID);
    request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
    m_page->mainFrame()->load(request);

    m_data.deleteDB(m_data.getData(SCrawlerData::ARTICLE_URL),SCrawlerData::ARTICLE_URL);
}
|
||
|
||
/**
 * Returns the first descendant of _FindElement that matches the selector
 * _strElement and whose attribute _strAttrib equals _strFind exactly.
 *
 * @param _FindElement  Element whose descendants are searched.
 * @param _strElement   Selector passed to QWebElement::findAll().
 * @param _strAttrib    Attribute name to read (defaults to "").
 * @param _strFind      Required attribute value (defaults to "").
 * @return The first matching element, or a default-constructed QWebElement
 *         when nothing matches.
 */
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib="",const QString _strFind="")
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();
    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        if (candidate.attribute(_strAttrib) == _strFind)
            return candidate;
    }
    return QWebElement();
}
|
||
|
||
/**
 * Returns the first descendant matching _strElement for which a slice of the
 * trimmed attribute value equals _strFind, ignoring case.
 *
 * @param _FindElement  Element whose descendants are searched.
 * @param _strElement   Selector passed to QWebElement::findAll().
 * @param _strAttrib    Attribute name to read.
 * @param _strFind      Text the slice must equal (case-insensitive).
 * @param _strStart     Start position of the slice (QString::mid position).
 * @param _strLength    Length of the slice (QString::mid length).
 * @return The first matching element, or a default-constructed QWebElement
 *         when nothing matches.
 */
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();
    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString slice = candidate.attribute(_strAttrib).trimmed().mid(_strStart,_strLength);
        if (slice.compare(_strFind, Qt::CaseInsensitive) == 0)
            return candidate;
    }
    return QWebElement();
}
|
||
|
||
/**
 * Convenience overload of FindMid() whose slice length is the length of
 * _strFind, i.e. the attribute must contain _strFind at position _strStart.
 */
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart, _strFind.length());
}
|
||
|
||
/**
 * Prefix variant of FindMid(): forwards to the five-argument overload with a
 * start position of 0, so the trimmed attribute value must begin with
 * _strFind (case-insensitive).
 */
QWebElement SCrawler::FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, 0);
}
|
||
|
||
/**
 * Collects every descendant matching _strElement for which a slice of the
 * trimmed attribute value equals _strFind, ignoring case.
 *
 * Parameters have the same meaning as in the six-argument FindMid().
 *
 * @return All matching elements in the order findAll() reported them; empty
 *         when nothing matches.
 */
QList<QWebElement> SCrawler::FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    QList<QWebElement> matches;
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();
    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString slice = candidate.attribute(_strAttrib).trimmed().mid(_strStart,_strLength);
        if (slice.compare(_strFind, Qt::CaseInsensitive) == 0)
            matches.append(candidate);
    }
    return matches;
}
|
||
|
||
|
||
void SCrawler::saveResult(bool ok)
|
||
{
|
||
if (m_bUse) return;
|
||
if (!ok)
|
||
cout << "Failed loading";
|
||
else
|
||
{
|
||
switch(m_reply.select())
|
||
{
|
||
case E_NAVER_NEWS:
|
||
saveResultNaverNews();
|
||
break;
|
||
|
||
case E_DAUM_CAFE:
|
||
saveResultDaumCafe();
|
||
break;
|
||
}
|
||
}
|
||
if (m_bUse)
|
||
cout << "ok";
|
||
else
|
||
cout << "fail";
|
||
emit finished();
|
||
}
|
||
|
||
void SCrawler::saveResultNaverNews()
|
||
{
|
||
QString strTitle,strDate,strData,strPlatID,strPlatTitle,strlike;
|
||
{
|
||
QWebElement element = Find(m_page->mainFrame()->documentElement(),"div","class","article_info");
|
||
{
|
||
strTitle = Find(element,"h3","id","articleTitle").toPlainText(); // Title;
|
||
strDate = Find(element,"span","class","t11").toPlainText(); // Date
|
||
}
|
||
strData = Find(m_page->mainFrame()->documentElement(),"div","id","articleBodyContents").toPlainText();
|
||
strlike = Find(m_page->mainFrame()->documentElement(),"div","class","u_likeit_module").toPlainText();
|
||
//entertainment
|
||
if (strTitle.isEmpty())
|
||
{
|
||
QWebElement elementTitle = Find(m_page->mainFrame()->documentElement(),"div","class","end_ct_area");
|
||
strTitle = Find(elementTitle,"p","class","end_tit").toPlainText();
|
||
}
|
||
//entertainment
|
||
if (strDate.isEmpty()) strDate = Find(element,"em").toPlainText();
|
||
if (strData.isEmpty()) strData = Find(m_page->mainFrame()->documentElement(),"div","id","articeBody").toPlainText();
|
||
|
||
if (strlike.isEmpty() == false) strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")";
|
||
else
|
||
{
|
||
//Debug("out.html",m_page->mainFrame()->toHtml());
|
||
}
|
||
|
||
element = Find(m_page->mainFrame()->documentElement(),"div","class","press_logo");
|
||
{
|
||
strPlatID = Find(element,"a").attribute("href");
|
||
strPlatTitle = Find(element,"img").attribute("alt");
|
||
QStringList strlistPlat = strPlatID.split(".");
|
||
if(strlistPlat.size() > 2)
|
||
{
|
||
if (strlistPlat.at(0) == QString("http://www"))
|
||
strPlatID = strlistPlat.at(1);
|
||
}
|
||
}
|
||
}
|
||
m_data.setData(m_data.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE);
|
||
m_data.setData(m_data.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA);
|
||
m_data.setData(strPlatID,SCrawlerData::PLATFORM_ID);
|
||
m_data.setData(strPlatTitle,SCrawlerData::PLATFORM_TITLE);
|
||
m_data.setData(strDate, SCrawlerData::ARTICLE_DATE);
|
||
m_data.setData("naver", SCrawlerData::PLATFORM_NAME);
|
||
m_data.setData("news", SCrawlerData::PLATFORM_FORM);
|
||
m_data.setData("body", SCrawlerData::ARTICLE_FORM);
|
||
m_data.sendDB();
|
||
saveFrameNaverNews(m_page->mainFrame());
|
||
m_reply.SetUrl(m_strUrl);
|
||
m_reply.Start(&m_data);
|
||
}
|
||
|
||
/**
 * Walks the frame tree depth-first looking for the frame named "ifrMemo".
 *
 * @param frame  Frame to inspect; its children are visited recursively.
 *
 * When that frame is found, the number shown in its "_totalcount" element
 * (thousands separators removed) is handed to m_reply.SetTotal() and m_bUse
 * is set, which turns every later call into a no-op.
 */
void SCrawler::saveFrameNaverNews(QWebFrame *frame)
{
    if (m_bUse)
        return;

    if (frame->frameName() == "ifrMemo")
    {
        const QWebElement counter = Find(frame->documentElement(),"strong","class","_totalcount");
        QString countText = counter.toPlainText().trimmed();
        countText.replace(",","");
        m_reply.SetTotal(countText.toInt());
        m_bUse = true;
    }

    const QList<QWebFrame*> children = frame->childFrames();
    for (int i = 0; i < children.size(); ++i)
        saveFrameNaverNews(children.at(i));
}
|
||
|
||
void SCrawler::saveResultDaumCafe()
|
||
{
|
||
saveFrameDaumCafe(m_page->mainFrame());
|
||
m_data.sendDB();
|
||
m_data.setData(QString(""),SCrawlerData::ETC);
|
||
m_reply.Start(&m_data);
|
||
}
|
||
|
||
void SCrawler::saveFrameDaumCafe(QWebFrame *frame)
|
||
{
|
||
if (m_bUse) return;
|
||
|
||
QWebElement other = frame->documentElement().findFirst("title");
|
||
QString strTitle = other.toPlainText().trimmed().split("|").at(0).trimmed();
|
||
QString strUrl_;
|
||
if (strTitle.isEmpty() == false)
|
||
{
|
||
QStringList strlist = m_strUrl.split("?");
|
||
if(strlist.size() > 1)
|
||
{
|
||
m_data.setData(strlist.at(0).trimmed(), SCrawlerData::ARTICLE_URL);
|
||
strUrl_ = strlist.at(0).trimmed();
|
||
}
|
||
else
|
||
{
|
||
m_data.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
|
||
strUrl_ = m_strUrl;
|
||
}
|
||
m_data.setData(m_data.SqlString(m_data.GetSafeUtf(strTitle)), SCrawlerData::PLATFORM_TITLE);
|
||
}
|
||
|
||
if (frame->frameName() == "down")
|
||
{
|
||
m_reply.SetDaumData(SReplyGetManage::E_DAUM_CDEPTH,Find(frame->documentElement(),"input","name","F_CDEPTH").attribute("value").trimmed());
|
||
m_data.setData(Find(frame->documentElement(),"input","name","grpid").attribute("value"),SCrawlerData::ETC);
|
||
;
|
||
QString strHits;
|
||
{
|
||
QString strData,strDate,strNick,strID,strTitle;
|
||
{
|
||
QWebElement group = Find(frame->documentElement(),"div","class","bbs_contents");
|
||
strData = m_data.SqlString(group.toPlainText().trimmed());
|
||
strData = m_data.GetSafeUtf(strData);
|
||
}
|
||
{
|
||
QWebElement group = Find(frame->documentElement(),"span","class","p11 ls0");
|
||
strDate = group.toPlainText().trimmed().replace(".","-");
|
||
strDate = strDate.replace("- "," ");
|
||
if (strDate.isEmpty() == true)
|
||
strDate = Find(frame->documentElement(),"input","name","PLAIN_REGDT").attribute("value");
|
||
else
|
||
strDate += ":00";
|
||
}
|
||
{
|
||
QWebElement group = Find(frame->documentElement(),"div","class","subject");
|
||
QWebElement group2 = Find(group,"span","class","b");
|
||
strTitle = m_data.SqlString(group2.toPlainText().trimmed());
|
||
}
|
||
|
||
{
|
||
QWebElement group = Find(Find(frame->documentElement(),"div","class","article_writer"),"a","href","#");
|
||
strNick = group.toPlainText().trimmed();
|
||
|
||
QWebElement id = Find(frame->documentElement(),"div","class","article_writer");
|
||
QStringList list = FindLeft(id,"a","onclick","showSideView").attribute("onclick").trimmed().split(",");
|
||
if (list.size() >= 2)
|
||
strID = list.at(1).trimmed().replace("'","");
|
||
}
|
||
|
||
QStringList strList = Find(frame->documentElement(),"div","class","article_writer").toPlainText().split("|");
|
||
|
||
foreach(QString str,strList)
|
||
{
|
||
QStringList substrList = str.split(" ");
|
||
for(int i = 0;i < substrList.size();i++)
|
||
{
|
||
if((substrList.at(i).trimmed().compare("조회") == 0) || (substrList.at(i).trimmed().compare("\"조회\"") == 0))
|
||
{
|
||
strHits = substrList.at(i+1).trimmed();
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
{
|
||
m_data.setData(strData, SCrawlerData::ARTICLE_DATA);
|
||
m_data.setData(strDate, SCrawlerData::ARTICLE_DATE);
|
||
m_data.setData(strNick, SCrawlerData::ARTICLE_NICKNAME);
|
||
if(!strID.isEmpty())
|
||
m_data.setData(strID, SCrawlerData::ARTICLE_ID);
|
||
m_data.setData(strHits, SCrawlerData::ARTICLE_HIT);
|
||
m_data.setData("daum", SCrawlerData::PLATFORM_NAME);
|
||
m_data.setData("cafe", SCrawlerData::PLATFORM_FORM);
|
||
m_data.setData("body", SCrawlerData::ARTICLE_FORM);
|
||
m_data.setData(m_strUrl.split("/").at(3), SCrawlerData::PLATFORM_ID);
|
||
m_data.setData(strTitle, SCrawlerData::ARTICLE_TITLE);
|
||
}
|
||
}
|
||
// Comment
|
||
{
|
||
QWebElement group = Find(frame->documentElement(),"div","class","commentDiv bg_sub");
|
||
QString strNumber = Find(Find(frame->documentElement(),"div","class","paging"),"a","onclick","return false;").toPlainText().trimmed();
|
||
m_reply.SetDaumData(SReplyGetManage::E_DAUM_TOTAL,strNumber);
|
||
if (strNumber.isEmpty()) strNumber = "1";
|
||
QList<QWebElement> elements = FindAllMid(group,"div","id","_cmt-",0,5);
|
||
QString commHidden = "comment_hidden";
|
||
QString commPos = "comment_pos";
|
||
QString commReComm = "recomment_pos";
|
||
QString strParent;
|
||
int nCount = (strNumber.toInt() - 1) * 50;
|
||
foreach (QWebElement element, elements)
|
||
{
|
||
if (element.attribute("class").trimmed().right(commHidden.length()).compare(commHidden,Qt::CaseInsensitive) != 0){
|
||
if (element.attribute("class").trimmed().left(commPos.length()).compare(commPos,Qt::CaseInsensitive) == 0)
|
||
{
|
||
QString strData = m_data.SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed());
|
||
if (strData.isEmpty()) continue;
|
||
strData = m_data.GetSafeUtf(strData);
|
||
|
||
QString strID;
|
||
QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(",");
|
||
if(strListID.length() > 2)
|
||
strID = strListID.at(1).trimmed().replace("'","");
|
||
|
||
QString strNick = Find(element,"a","class","b").toPlainText().trimmed();
|
||
strParent = strNick;
|
||
|
||
QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed();
|
||
QString strDate;
|
||
if(strDatetest.count(".") == 0)
|
||
{
|
||
strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd");
|
||
strDate += (" " + strDatetest + ":00");
|
||
QDateTime getTime = QDateTime::fromString(strDate, "yyyy-MM-dd hh:mm:ss");
|
||
QDateTime nowTime = QDateTime::currentDateTime();
|
||
if(getTime > nowTime)
|
||
{
|
||
getTime.addDays(-1);
|
||
strDate = getTime.toString("yyyy-MM-dd hh:mm:ss");
|
||
}
|
||
}
|
||
else
|
||
{
|
||
strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
|
||
}
|
||
|
||
if (strDate.isEmpty()) continue;
|
||
QSqlQuery query;
|
||
|
||
query.prepare(QString("insert into " + m_data.GetTable() + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
|
||
query.bindValue(":URL",strUrl_.toUtf8());
|
||
query.bindValue(":ID",strID.toUtf8());
|
||
query.bindValue(":NICK",strNick.toUtf8());
|
||
query.bindValue(":DATA",strData.toUtf8());
|
||
query.bindValue(":DATE",strDate.toUtf8());
|
||
query.bindValue(":ROWNUM",nCount++);
|
||
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
|
||
query.bindValue(":HITS",strHits.toUtf8());
|
||
query.bindValue(":TITLE",strTitle.toUtf8());
|
||
|
||
if (query.exec()==false)
|
||
cout << "error : " << query.lastError().text().toStdString();
|
||
}
|
||
if (element.attribute("class").trimmed().left(commReComm.length()).compare(commReComm,Qt::CaseInsensitive) == 0)
|
||
{
|
||
QString strData = m_data.SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed());
|
||
if (strData.isEmpty()) continue;
|
||
|
||
QString strReParent = Find(element,"span","class","mention_nicknames text_counter txt_point b").toPlainText().trimmed();
|
||
if(strReParent.length() == 0)
|
||
strReParent = strParent;
|
||
|
||
QString strID;
|
||
QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(",");
|
||
if(strListID.length() > 2)
|
||
strID = strListID.at(1).trimmed().replace("'","");
|
||
|
||
QString strNick = Find(element,"a","class","b").toPlainText().trimmed();
|
||
QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed();
|
||
QString strDate;
|
||
if(strDatetest.count(".") == 0)
|
||
{
|
||
strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd");
|
||
strDate += (" " + strDatetest + ":00");
|
||
QDateTime getTime = QDateTime::fromString(strDate, "yyyy-MM-dd hh:mm:ss");
|
||
QDateTime nowTime = QDateTime::currentDateTime();
|
||
if(getTime > nowTime)
|
||
{
|
||
getTime.addDays(-1);
|
||
strDate = getTime.toString("yyyy-MM-dd hh:mm:ss");
|
||
}
|
||
}
|
||
else
|
||
{
|
||
strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
|
||
}
|
||
if (strDate.isEmpty()) continue;
|
||
QSqlQuery query;
|
||
query.prepare(QString("insert into " + m_data.GetTable() + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
|
||
query.bindValue(":URL",strUrl_.toUtf8());
|
||
query.bindValue(":ID",strID.toUtf8());
|
||
query.bindValue(":NICK",strNick.toUtf8());
|
||
query.bindValue(":DATA",strData.toUtf8());
|
||
query.bindValue(":DATE",strDate.toUtf8());
|
||
query.bindValue(":PARENT",strReParent.toUtf8());
|
||
//query.bindValue(":URLREPLY",m_strReper.toUtf8());
|
||
query.bindValue(":ROWNUM",nCount++);
|
||
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
|
||
query.bindValue(":HITS",strHits.toUtf8());
|
||
query.bindValue(":TITLE",strTitle.toUtf8());
|
||
//QWebView::page()->mainFrame()->evaluateJavaScript("");
|
||
if (query.exec()==false)
|
||
cout << "error : " << query.lastError().text().toStdString();
|
||
}
|
||
}
|
||
}
|
||
}
|
||
m_bUse = true;
|
||
}
|
||
else
|
||
m_reply.SetDaumData(SReplyGetManage::E_DAUM_DOWNSRC, Find(frame->documentElement(),"frame","name","down").attribute("src").trimmed());
|
||
|
||
foreach(QWebFrame *childFrame, frame->childFrames())
|
||
saveFrameDaumCafe(childFrame);
|
||
}
|
||
|