Files
clients/AjaxCrawlerProcess/scrawler.cpp
admin cf56b35cd6 헤더 대소문자 수정
git-svn-id: svn://192.168.0.12/source@209 8346c931-da38-4b9b-9d4c-e48b93cbd075
2015-10-14 04:17:30 +00:00

446 lines
20 KiB
C++
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "scrawler.h"
#include <iostream>
#include <QNetworkRequest>
#include <QWebFrame>
#include <QWebElement>
#include <QWebElementCollection>
#include <QSqlQuery>
#include <QSqlError>
using namespace std;
#include <QFile>
#include <QTextStream>
#include "data.h"
// Appends _strData to the text file named _strFilename.
// If the file cannot be opened for appending, nothing is written and the
// call returns silently.
void SCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile dumpFile(_strFilename);
    const bool bOpened = dumpFile.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append);
    if (bOpened)
    {
        QTextStream writer(&dumpFile);
        writer << _strData;
        dumpFile.close();
    }
}
// Builds the crawler and the QWebPage it uses to load pages.
// The page is created with this object as its QObject parent so that it is
// destroyed together with the crawler (the destructor does not delete it
// explicitly). The page's loadFinished(bool) signal is routed to saveResult(bool).
SCrawler::SCrawler(QObject *parent) : QObject(parent) , m_bUse(false)
{
    m_page = new QWebPage(this);
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}
// Destructor. The body is empty: no explicit cleanup is performed here.
SCrawler::~SCrawler()
{
}
// Starts loading the page described by _strlistArgv.
//
// _strlistArgv layout as read by this function:
//   [0] site selector: "naver_news" or "daum_cafe"
//   [1] URL of the article to load
//   [2] suffix appended to "data_" to form the destination table name
//   [3] value stored as KEYWORD_ID
// All four entries are read with at() without a size check, so the caller
// must supply at least four elements.
//
// For "daum_cafe" the stored ARTICLE_URL is the URL with any query string
// (everything from the first '?') removed. Any existing rows for the stored
// ARTICLE_URL are deleted after the load request has been issued.
void SCrawler::load(QStringList _strlistArgv)
{
    if (_strlistArgv.at(0) == "naver_news")
    {
        m_strUrl = _strlistArgv.at(1);
        m_data.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
        m_reply.SetSelect(E_NAVER_NEWS);
    }
    if (_strlistArgv.at(0) == "daum_cafe")
    {
        m_strUrl = _strlistArgv.at(1);
        m_reply.SetSelect(E_DAUM_CAFE);
        QStringList strlist = m_strUrl.split("?");
        if(strlist.size() > 1)
            m_data.setData(strlist.at(0).trimmed(), SCrawlerData::ARTICLE_URL);
        else
            m_data.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
    }
    cout << m_strUrl.toStdString() << endl;

    QUrl url = QUrl(m_strUrl);
    if (url.scheme().isEmpty())
        url.setScheme("http");
    m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);

    // The request lives on the stack: QWebFrame::load() takes it by const
    // reference, so a heap allocation here would never be freed.
    QNetworkRequest request;
    request.setUrl(url);
    m_data.setTable("data_"+_strlistArgv.at(2));
    m_data.setData(_strlistArgv.at(3), SCrawlerData::KEYWORD_ID);
    request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
    m_page->mainFrame()->load(request);
    m_data.deleteDB(m_data.getData(SCrawlerData::ARTICLE_URL),SCrawlerData::ARTICLE_URL);
}
// Returns the first descendant of _FindElement matching the selector
// _strElement whose attribute _strAttrib is exactly equal to _strFind.
// When nothing matches, a default-constructed QWebElement is returned.
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib="",const QString _strFind="")
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int nTotal = candidates.count();
    for (int idx = 0; idx < nTotal; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        if (candidate.attribute(_strAttrib) == _strFind)
            return candidate;
    }
    return QWebElement();
}
// Returns the first descendant of _FindElement matching the selector
// _strElement for which the substring [_strStart, _strStart + _strLength)
// of the trimmed attribute _strAttrib equals _strFind, ignoring case.
// When nothing matches, a default-constructed QWebElement is returned.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int nTotal = candidates.count();
    for (int idx = 0; idx < nTotal; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString strSlice = candidate.attribute(_strAttrib).trimmed().mid(_strStart,_strLength);
        if (strSlice.compare(_strFind, Qt::CaseInsensitive) == 0)
            return candidate;
    }
    return QWebElement();
}
// Overload of FindMid that compares exactly as many characters as _strFind
// contains, starting at offset _strStart of the attribute value.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart, _strFind.length());
}
// Returns the first matching descendant whose trimmed attribute _strAttrib
// begins with _strFind (case-insensitive); delegates to FindMid at offset 0.
QWebElement SCrawler::FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, 0);
}
// Collects, in document order as returned by findAll(), every descendant of
// _FindElement matching the selector _strElement for which the substring
// [_strStart, _strStart + _strLength) of the trimmed attribute _strAttrib
// equals _strFind, ignoring case. The list is empty when nothing matches.
QList<QWebElement> SCrawler::FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    QList<QWebElement> matches;
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int nTotal = candidates.count();
    for (int idx = 0; idx < nTotal; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString strSlice = candidate.attribute(_strAttrib).trimmed().mid(_strStart,_strLength);
        if (strSlice.compare(_strFind, Qt::CaseInsensitive) == 0)
            matches.append(candidate);
    }
    return matches;
}
// Slot invoked when the page finishes loading.
// Does nothing if a result has already been recorded (m_bUse set). Otherwise
// dispatches to the site-specific extractor chosen via m_reply.select(),
// prints "ok" or "fail" depending on whether m_bUse ended up set, and emits
// finished(). A failed load prints "Failed loading" and skips extraction.
void SCrawler::saveResult(bool ok)
{
    if (m_bUse)
        return;

    if (ok)
    {
        switch(m_reply.select())
        {
        case E_NAVER_NEWS:
            saveResultNaverNews();
            break;
        case E_DAUM_CAFE:
            saveResultDaumCafe();
            break;
        }
    }
    else
    {
        cout << "Failed loading";
    }

    cout << (m_bUse ? "ok" : "fail");
    emit finished();
}
void SCrawler::saveResultNaverNews()
{
QString strTitle,strDate,strData,strPlatID,strPlatTitle,strlike;
{
QWebElement element = Find(m_page->mainFrame()->documentElement(),"div","class","article_info");
{
strTitle = Find(element,"h3","id","articleTitle").toPlainText(); // Title;
strDate = Find(element,"span","class","t11").toPlainText(); // Date
}
strData = Find(m_page->mainFrame()->documentElement(),"div","id","articleBodyContents").toPlainText();
strlike = Find(m_page->mainFrame()->documentElement(),"div","class","u_likeit_module").toPlainText();
//entertainment
if (strTitle.isEmpty())
{
QWebElement elementTitle = Find(m_page->mainFrame()->documentElement(),"div","class","end_ct_area");
strTitle = Find(elementTitle,"p","class","end_tit").toPlainText();
}
//entertainment
if (strDate.isEmpty()) strDate = Find(element,"em").toPlainText();
if (strData.isEmpty()) strData = Find(m_page->mainFrame()->documentElement(),"div","id","articeBody").toPlainText();
if (strlike.isEmpty() == false) strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")";
else
{
Debug("out.html",m_page->mainFrame()->toHtml());
}
element = Find(m_page->mainFrame()->documentElement(),"div","class","press_logo");
{
strPlatID = Find(element,"a").attribute("href");
strPlatTitle = Find(element,"img").attribute("alt");
QStringList strlistPlat = strPlatID.split(".");
if(strlistPlat.size() > 2)
{
if (strlistPlat.at(0) == QString("http://www"))
strPlatID = strlistPlat.at(1);
}
}
}
m_data.setData(m_data.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE);
m_data.setData(m_data.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA);
m_data.setData(strPlatID,SCrawlerData::PLATFORM_ID);
m_data.setData(strPlatTitle,SCrawlerData::PLATFORM_TITLE);
m_data.setData(strDate, SCrawlerData::ARTICLE_DATE);
m_data.setData("naver", SCrawlerData::PLATFORM_NAME);
m_data.setData("news", SCrawlerData::PLATFORM_FORM);
m_data.setData("body", SCrawlerData::ARTICLE_FORM);
m_data.sendDB();
saveFrameNaverNews(m_page->mainFrame());
m_reply.SetUrl(m_strUrl);
m_reply.Start(&m_data);
}
// Walks the frame tree looking for the frame named "ifrMemo". When found,
// its "_totalcount" text (commas removed) is passed to m_reply.SetTotal()
// and m_bUse is set, which stops any further processing in later calls.
void SCrawler::saveFrameNaverNews(QWebFrame *frame)
{
    if (m_bUse)
        return;

    if (frame->frameName() == "ifrMemo")
    {
        QString strTotal = Find(frame->documentElement(),"strong","class","_totalcount").toPlainText().trimmed();
        strTotal.replace(",","");
        m_reply.SetTotal(strTotal.toInt());
        m_bUse = true;
    }

    const QList<QWebFrame*> children = frame->childFrames();
    for (int idx = 0; idx < children.size(); ++idx)
        saveFrameNaverNews(children.at(idx));
}
void SCrawler::saveResultDaumCafe()
{
saveFrameDaumCafe(m_page->mainFrame());
m_data.sendDB();
m_data.setData(QString(""),SCrawlerData::ETC);
m_reply.Start(&m_data);
}
void SCrawler::saveFrameDaumCafe(QWebFrame *frame)
{
if (m_bUse) return;
QWebElement other = frame->documentElement().findFirst("title");
QString strTitle = other.toPlainText().trimmed().split("|").at(0).trimmed();
QString strUrl_;
if (strTitle.isEmpty() == false)
{
QStringList strlist = m_strUrl.split("?");
if(strlist.size() > 1)
{
m_data.setData(strlist.at(0).trimmed(), SCrawlerData::ARTICLE_URL);
strUrl_ = strlist.at(0).trimmed();
}
else
{
m_data.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
strUrl_ = m_strUrl;
}
m_data.setData(m_data.SqlString(m_data.GetSafeUtf(strTitle)), SCrawlerData::PLATFORM_TITLE);
}
if (frame->frameName() == "down")
{
m_reply.SetDaumData(SReplyGetManage::E_DAUM_CDEPTH,Find(frame->documentElement(),"input","name","F_CDEPTH").attribute("value").trimmed());
m_data.setData(Find(frame->documentElement(),"input","name","grpid").attribute("value"),SCrawlerData::ETC);
;
QString strHits;
{
QString strData,strDate,strNick,strID,strTitle;
{
QWebElement group = Find(frame->documentElement(),"div","class","bbs_contents");
strData = m_data.SqlString(group.toPlainText().trimmed());
strData = m_data.GetSafeUtf(strData);
}
{
QWebElement group = Find(frame->documentElement(),"span","class","p11 ls0");
strDate = group.toPlainText().trimmed().replace(".","-");
strDate = strDate.replace("- "," ");
if (strDate.isEmpty() == true)
strDate = Find(frame->documentElement(),"input","name","PLAIN_REGDT").attribute("value");
else
strDate += ":00";
}
{
QWebElement group = Find(frame->documentElement(),"div","class","subject");
QWebElement group2 = Find(group,"span","class","b");
strTitle = m_data.SqlString(group2.toPlainText().trimmed());
}
{
QWebElement group = Find(Find(frame->documentElement(),"div","class","article_writer"),"a","href","#");
strNick = group.toPlainText().trimmed();
QWebElement id = Find(frame->documentElement(),"div","class","article_writer");
QStringList list = FindLeft(id,"a","onclick","showSideView").attribute("onclick").trimmed().split(",");
if (list.size() >= 2)
strID = list.at(1).trimmed().replace("'","");
}
QStringList strList = Find(frame->documentElement(),"div","class","article_writer").toPlainText().split("|");
foreach(QString str,strList)
{
QStringList substrList = str.split(" ");
for(int i = 0;i < substrList.size();i++)
{
if((substrList.at(i).trimmed().compare("조회") == 0) || (substrList.at(i).trimmed().compare("\"조회\"") == 0))
{
strHits = substrList.at(i+1).trimmed();
break;
}
}
}
{
m_data.setData(strData, SCrawlerData::ARTICLE_DATA);
m_data.setData(strDate, SCrawlerData::ARTICLE_DATE);
m_data.setData(strNick, SCrawlerData::ARTICLE_NICKNAME);
if(!strID.isEmpty())
m_data.setData(strID, SCrawlerData::ARTICLE_ID);
m_data.setData(strHits, SCrawlerData::ARTICLE_HIT);
m_data.setData("daum", SCrawlerData::PLATFORM_NAME);
m_data.setData("cafe", SCrawlerData::PLATFORM_FORM);
m_data.setData("body", SCrawlerData::ARTICLE_FORM);
m_data.setData(m_strUrl.split("/").at(3), SCrawlerData::PLATFORM_ID);
m_data.setData(strTitle, SCrawlerData::ARTICLE_TITLE);
}
}
// Comment
{
QWebElement group = Find(frame->documentElement(),"div","class","commentDiv bg_sub");
QString strNumber = Find(Find(frame->documentElement(),"div","class","paging"),"a","onclick","return false;").toPlainText().trimmed();
m_reply.SetDaumData(SReplyGetManage::E_DAUM_TOTAL,strNumber);
if (strNumber.isEmpty()) strNumber = "1";
QList<QWebElement> elements = FindAllMid(group,"div","id","_cmt-",0,5);
QString commHidden = "comment_hidden";
QString commPos = "comment_pos";
QString commReComm = "recomment_pos";
QString strParent;
int nCount = (strNumber.toInt() - 1) * 50;
foreach (QWebElement element, elements)
{
if (element.attribute("class").trimmed().right(commHidden.length()).compare(commHidden,Qt::CaseInsensitive) != 0){
if (element.attribute("class").trimmed().left(commPos.length()).compare(commPos,Qt::CaseInsensitive) == 0)
{
QString strData = m_data.SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed());
if (strData.isEmpty()) continue;
strData = m_data.GetSafeUtf(strData);
QString strID;
QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(",");
if(strListID.length() > 2)
strID = strListID.at(1).trimmed().replace("'","");
QString strNick = Find(element,"a","class","b").toPlainText().trimmed();
strParent = strNick;
QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed();
QString strDate;
if(strDatetest.count(".") == 0)
{
strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd");
strDate += (" " + strDatetest + ":00");
QDateTime getTime = QDateTime::fromString(strDate, "yyyy-MM-dd hh:mm:ss");
QDateTime nowTime = QDateTime::currentDateTime();
if(getTime > nowTime)
{
getTime.addDays(-1);
strDate = getTime.toString("yyyy-MM-dd hh:mm:ss");
}
}
else
{
strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
}
if (strDate.isEmpty()) continue;
QSqlQuery query;
query.prepare(QString("insert into " + m_data.GetTable() + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
query.bindValue(":URL",strUrl_.toUtf8());
query.bindValue(":ID",strID.toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strData.toUtf8());
query.bindValue(":DATE",strDate.toUtf8());
query.bindValue(":ROWNUM",nCount++);
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
query.bindValue(":HITS",strHits.toUtf8());
query.bindValue(":TITLE",strTitle.toUtf8());
if (query.exec()==false)
cout << "error : " << query.lastError().text().toStdString();
}
if (element.attribute("class").trimmed().left(commReComm.length()).compare(commReComm,Qt::CaseInsensitive) == 0)
{
QString strData = m_data.SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed());
if (strData.isEmpty()) continue;
QString strReParent = Find(element,"span","class","mention_nicknames text_counter txt_point b").toPlainText().trimmed();
if(strReParent.length() == 0)
strReParent = strParent;
QString strID;
QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(",");
if(strListID.length() > 2)
strID = strListID.at(1).trimmed().replace("'","");
QString strNick = Find(element,"a","class","b").toPlainText().trimmed();
QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed();
QString strDate;
if(strDatetest.count(".") == 0)
{
strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd");
strDate += (" " + strDatetest + ":00");
QDateTime getTime = QDateTime::fromString(strDate, "yyyy-MM-dd hh:mm:ss");
QDateTime nowTime = QDateTime::currentDateTime();
if(getTime > nowTime)
{
getTime.addDays(-1);
strDate = getTime.toString("yyyy-MM-dd hh:mm:ss");
}
}
else
{
strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
}
if (strDate.isEmpty()) continue;
QSqlQuery query;
query.prepare(QString("insert into " + m_data.GetTable() + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
query.bindValue(":URL",strUrl_.toUtf8());
query.bindValue(":ID",strID.toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strData.toUtf8());
query.bindValue(":DATE",strDate.toUtf8());
query.bindValue(":PARENT",strReParent.toUtf8());
//query.bindValue(":URLREPLY",m_strReper.toUtf8());
query.bindValue(":ROWNUM",nCount++);
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
query.bindValue(":HITS",strHits.toUtf8());
query.bindValue(":TITLE",strTitle.toUtf8());
//QWebView::page()->mainFrame()->evaluateJavaScript("");
if (query.exec()==false)
cout << "error : " << query.lastError().text().toStdString();
}
}
}
}
m_bUse = true;
}
else
m_reply.SetDaumData(SReplyGetManage::E_DAUM_DOWNSRC, Find(frame->documentElement(),"frame","name","down").attribute("src").trimmed());
foreach(QWebFrame *childFrame, frame->childFrames())
saveFrameDaumCafe(childFrame);
}