762 lines
30 KiB
C++
762 lines
30 KiB
C++
#include "scrawler.h"
|
|
#include <iostream>
|
|
#include <QSqlQuery>
|
|
#include <QSqlError>
|
|
#include <QByteArray>
|
|
#include <qDebug>
|
|
|
|
using namespace std;
|
|
|
|
struct SProxyList
|
|
{
|
|
QString m_strAddress;
|
|
int m_nPort;
|
|
};
|
|
|
|
/// Creates the crawler together with the QWebPage it drives and routes the
/// page's loadFinished(bool) signal to saveResult(bool).
SCrawler::SCrawler()
    : QObject()
{
    // Parent the page to this object: the destructor does not delete it, so
    // without a parent the page would leak. QObject ownership destroys it
    // together with the crawler.
    m_page = new QWebPage(this);
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}
|
|
|
|
/// Destructor. Performs no explicit cleanup of its own.
SCrawler::~SCrawler() {}
|
|
|
|
void SCrawler::load(QStringList _strlistArgv)
|
|
{
|
|
m_bUse = false;
|
|
|
|
if (_strlistArgv[0] == "naver")
|
|
{
|
|
if (_strlistArgv[1] == "cafe_list")
|
|
{
|
|
m_strUrl = _strlistArgv[2];
|
|
m_nSelect = E_NAVER_CAFE_LIST;
|
|
m_strKeywordID = _strlistArgv[4];
|
|
|
|
QFile file("proxy.txt");
|
|
if (file.open(QIODevice::ReadOnly | QIODevice::Text))
|
|
{
|
|
QVector <QStringList> vecProxy;
|
|
while (!file.atEnd())
|
|
{
|
|
QString str = QString(file.readLine());
|
|
if (str.isEmpty()) continue;
|
|
vecProxy.push_back(str.split(","));
|
|
}
|
|
if (vecProxy.size() > 0)
|
|
{
|
|
QStringList strList = vecProxy.at(rand()%vecProxy.size());
|
|
switch(strList.size())
|
|
{
|
|
case 1:
|
|
cout << "p : " << strList.at(0).toStdString() << endl;
|
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
|
break;
|
|
case 2:
|
|
cout << "p : " << strList.at(0).toStdString() << endl;
|
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (_strlistArgv[1] == "cafe_data")
|
|
{
|
|
m_strUrl = _strlistArgv[2];
|
|
m_nSelect = E_NAVER_CAFE_DATA;
|
|
m_strReper = _strlistArgv[4];
|
|
}
|
|
|
|
if (_strlistArgv[1] == "blog_list")
|
|
{
|
|
m_strUrl = _strlistArgv[2];
|
|
m_nSelect = E_NAVER_BLOG_LIST;
|
|
m_strKeywordID = _strlistArgv[4];
|
|
//cout << "ok";
|
|
}
|
|
|
|
if (_strlistArgv[1] == "blog_url")
|
|
{
|
|
m_strUrl = _strlistArgv[2];
|
|
m_nSelect = E_NAVER_BLOG_BODY;
|
|
//m_strReper = _strlistArgv[4];
|
|
m_bUse = true;
|
|
}
|
|
|
|
if (_strlistArgv[1] == "blog_comm")
|
|
{
|
|
m_strUrl = _strlistArgv[2];
|
|
m_nSelect = E_NAVER_BLOG_REPLY;
|
|
}
|
|
m_strTable = "data_" + _strlistArgv[3];
|
|
}
|
|
|
|
cout << m_strUrl.toStdString() << endl;
|
|
|
|
QUrl url = QUrl(m_strUrl);
|
|
if (url.scheme().isEmpty())
|
|
url.setScheme("http");
|
|
|
|
m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);
|
|
QNetworkRequest *request = new QNetworkRequest;
|
|
request->setUrl(url);
|
|
/*
|
|
request->setRawHeader("Cache-Control","max-age=0, no-cache");
|
|
request->setRawHeader("Pragma","no-cache");
|
|
request->setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
|
|
*/
|
|
if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
|
|
request->setRawHeader("Referer",m_strReper.toLocal8Bit());
|
|
request->setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
|
|
m_page->mainFrame()->load(*request);
|
|
m_bLast = false;
|
|
m_bError = false;
|
|
}
|
|
|
|
/// Sets m_bError to true, whatever message is passed in.
/// @param _strError  outcome text; currently unused (the SQL update that
///                   consumed it is disabled in this function).
void SCrawler::UpdateError(QString _strError)
{
    Q_UNUSED(_strError);
    m_bError = true;
}
|
|
|
|
void SCrawler::saveResult(bool ok)
|
|
{
|
|
if (!ok)
|
|
{
|
|
std::cerr << "Failed loading " << qPrintable(m_page->mainFrame()->url().toString()) << std::endl;
|
|
emit finished();
|
|
return;
|
|
}
|
|
|
|
switch(m_nSelect)
|
|
{
|
|
case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break;
|
|
case E_NAVER_CAFE_DATA:saveFrameCafeUrl(m_page->mainFrame());break;
|
|
case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break;
|
|
case E_NAVER_BLOG_BODY:saveFrameUrl(m_page->mainFrame());break;
|
|
case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break;
|
|
}
|
|
|
|
switch(m_nSelect)
|
|
{
|
|
case E_NAVER_CAFE_LIST:
|
|
case E_NAVER_BLOG_LIST:
|
|
if (m_bError) cout << "block";// block
|
|
if (m_bLast) cout << "last";
|
|
break;
|
|
case E_NAVER_BLOG_REPLY:
|
|
cout << "ok";
|
|
break;
|
|
case E_NAVER_CAFE_DATA:
|
|
case E_NAVER_BLOG_BODY:
|
|
if (m_bUse == false)
|
|
{
|
|
cout << "fail";
|
|
UpdateError("Error code 0");
|
|
}
|
|
else
|
|
{
|
|
if (m_bError == false)
|
|
{
|
|
cout << "ok";
|
|
UpdateError("ok");
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
emit finished();
|
|
}
|
|
|
|
/// Extracts every ASCII digit from @p _str, in order, and returns them as an
/// int (e.g. "1,234" yields 1234).
/// @return the parsed value, or 0 when there are no digits or the digit run
///         does not fit in an int (QString::toInt() failure).
int SCrawler::GetNumber(QString _str)
{
    QString strDigits;
    strDigits.reserve(_str.size());
    for (int i = 0; i < _str.size(); ++i)
    {
        // Only '0'..'9' are kept. QChar::isNumber() also accepts characters
        // such as superscripts and fractions that toInt() cannot parse; a
        // single one of those would turn the whole result into 0.
        const ushort code = _str.at(i).unicode();
        if (code >= '0' && code <= '9')
            strDigits += _str.at(i);
    }
    return strDigits.toInt();
}
|
|
|
|
/// Appends @p _strData as text to the file @p _strFilename.
/// Returns without writing if the file cannot be opened.
void SCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile dumpFile(_strFilename);
    const bool bOpened = dumpFile.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append);
    if (bOpened)
    {
        QTextStream stream(&dumpFile);
        stream << _strData;
        dumpFile.close();
    }
}
|
|
|
|
/// Backslash-escapes characters that would terminate or corrupt a quoted SQL
/// string literal: backslash, single quote and double quote.
/// NOTE(review): backslash escaping is the MySQL-style convention; bound
/// query parameters are the safer choice wherever they can be used.
QString SCrawler::SqlString(QString _str)
{
    // Backslashes must be doubled first. Otherwise an input ending in "\'"
    // would become "\\'" -- an escaped backslash followed by a bare quote.
    _str.replace("\\", "\\\\");
    _str.replace("'", "\\'");
    _str.replace("\"", "\\\"");
    return _str;
}
|
|
|
|
|
|
/// Returns a copy of @p _strData that keeps only:
///   - code points 12593..12622 (U+3131..U+314E),
///   - code points 44032..55203 (U+AC00..U+D7A3, Hangul syllables),
///   - characters QChar classifies as digit, number, space, lower-case,
///     upper-case or symbol.
/// Everything else (punctuation, for instance) is dropped.
QString SCrawler::GetSafeUtf(QString _strData)
{
    QString strKept;
    const int nLength = _strData.length();

    for (int idx = 0; idx < nLength; ++idx)
    {
        const QChar ch = _strData.at(idx);
        const ushort code = ch.unicode();

        const bool bInFirstRange  = (code >= 12593 && code <= 12622);
        const bool bInSecondRange = (code >= 44032 && code <= 55203);
        const bool bGeneralClass  = ch.isDigit() || ch.isNumber() || ch.isSpace() ||
                                    ch.isLower() || ch.isUpper() || ch.isSymbol();

        // Three independent tests, each appending on its own.
        if (bInFirstRange)
            strKept += ch;
        if (bInSecondRange)
            strKept += ch;
        if (bGeneralClass)
            strKept += ch;
    }
    return strKept;
}
|
|
|
|
/// Parses one page of blog search results (up to ten "sp_blog_N" items) and
/// inserts every blog.naver.com post URL that is not yet stored in m_strTable.
/// Afterwards it reads the "title_num" counter and the last query parameter
/// of m_strUrl to decide whether this was the final page (m_bLast).
///
/// Side effects: sets m_bUse once an item has been processed, m_bLast at the
/// end of the result list, m_bError when the counter is missing or malformed.
void SCrawler::saveFrameList(QWebFrame *frame)
{
    if (m_bUse == true) return;

    QWebElement eleMain = Find(frame->documentElement(),"div","class","blog section _blogBase");

    for (int i = 0; i < 10; i++)
    {
        QWebElement sub = Find(eleMain,"li","id",QString("sp_blog_") + QString::number(i+1));
        const QString strUrl = Find(sub,"a","class","url").toPlainText();
        if (strUrl.isEmpty())
        {
            // No further result item on this page.
            m_bLast = true;
            m_bUse = true;
            return;
        }
        cout << "url : " << strUrl.toStdString();

        // Expect "blog.naver.com/<blog id>/..."; the second piece is stored
        // as platform_id, so a URL without it cannot be used.
        const QStringList urlParts = strUrl.split('/');
        if (urlParts.at(0).compare("blog.naver.com") != 0 || urlParts.size() < 2)
        {
            cout << " not" << endl;
            continue;
        }

        // Rows are stored with an "http://" prefix, so look up that form.
        const QString strArticleUrl = "http://" + strUrl;

        // Bound parameters keep scraped text out of the SQL statement.
        // Existence is decided with next(): QSqlQuery::size() returns -1 on
        // drivers that cannot report sizes, so it is not a portable test.
        QSqlQuery sqlFind;
        sqlFind.prepare(QString("select article_url from " + m_strTable + " where article_url = :URL"));
        sqlFind.bindValue(":URL", strArticleUrl.toUtf8());
        const bool bExists = sqlFind.exec() && sqlFind.next();

        if (!bExists)
        {
            // GetSafeUtf() already drops quote characters, so no manual
            // escaping is needed before binding.
            const QString strTitle = GetSafeUtf(Find(sub,"a","class","txt84").toPlainText()).trimmed();

            QSqlQuery sqlInsert;
            sqlInsert.prepare(QString("insert into " + m_strTable +
                                      " (article_url,platform_id,platform_title,keyword_id)"
                                      " VALUES (:URL,:PID,:TITLE,:KEYWORD)"));
            sqlInsert.bindValue(":URL", strArticleUrl.toUtf8());
            sqlInsert.bindValue(":PID", urlParts.at(1).toUtf8());
            sqlInsert.bindValue(":TITLE", strTitle.toUtf8());
            sqlInsert.bindValue(":KEYWORD", m_strKeywordID.toUtf8());
            if (sqlInsert.exec() == false)
                cout << "error : " << sqlInsert.lastError().text().toStdString();
            else
                cout << " ok" << endl;
        }
        else
            cout << " overlap" << endl;

        m_bUse = true;
    }

    // Pagination: the counter text is split at '/' and the part after it is
    // taken as the total; the value of the last URL parameter is taken as the
    // current offset.
    const QString strTotal = Find(eleMain,"span","class","title_num").toPlainText();
    if (strTotal.isEmpty()) { m_bError = true; return; }

    const QStringList totalParts = strTotal.split("/");
    const QStringList lastParam = m_strUrl.split("&").last().split("=");
    if (totalParts.size() < 2 || lastParam.size() < 2)
    {
        // Unexpected layout: report it like a missing counter instead of
        // indexing out of range.
        m_bError = true;
        return;
    }

    const int nTotal = GetNumber(totalParts.at(1));
    const int nNow = GetNumber(lastParam.at(1));
    if ((nNow + 10) > nTotal || nNow >= 1000)
        m_bLast = true;
}
|
|
|
|
// Slot indices for the per-article value and column-name arrays used by
// SCrawler::saveFrameUrl(). E_DATA_MAX is the slot count, not a slot.
enum E_DATA
{
    E_DATA_NICK           = 0,
    E_DATA_ID             = 1,
    E_DATA_TITLE          = 2,
    E_DATA_DATE           = 3,
    E_DATA_DATA           = 4,
    E_DATA_PLATFORM_TITLE = 5,
    E_DATA_MAX            = 6
};
|
|
|
|
/// Extracts blog post data from @p frame and, recursively, from all of its
/// child frames, writing the results to the row of m_strTable whose
/// article_url equals m_strUrl.
///
///  - frame "BuddyConnectIframe": updates article_nickname from the first
///    line of the "profile_name" element.
///  - frame "mainFrame": updates nickname, id, title, date and body.
///
/// On an SQL failure or a missing post date, UpdateError() is called and
/// m_bUse is cleared. A missing date does not prevent the update itself.
void SCrawler::saveFrameUrl(QWebFrame *frame)
{
    // Fourth '/'-separated piece of the URL, used as fallback blog id.
    // value() yields an empty string when the URL has fewer pieces.
    const QString strUrlId = m_strUrl.split("/").value(3);

    if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0)
    {
        QWebElement profile = Find(frame->documentElement(),"div","class","profile_name");
        const QString strNick = profile.toPlainText().split("\n").at(0);
        if (strNick.isEmpty() == false)
        {
            // Bound parameters: scraped text never becomes part of the SQL.
            QSqlQuery sql;
            sql.prepare(QString("update " + m_strTable +
                                " set article_nickname = :NICK where article_url = :URL"));
            sql.bindValue(":NICK", strNick.toUtf8());
            sql.bindValue(":URL", m_strUrl.toUtf8());
            if (sql.exec() == false)
            {
                cout << "error : " << sql.lastError().text().toStdString();
                UpdateError("Error code 1");
                m_bUse = false;
            }
        }
    }

    if (frame->frameName().compare(QString("mainFrame")) == 0)
    {
        QString str[E_DATA_MAX];

        // --- nickname and id -------------------------------------------
        QWebElement profile = Find(frame->documentElement(),"div","id","blog-profile");
        {
            const QString strNickArea = Find(profile,"strong","id","nickNameArea").toPlainText();
            if (strNickArea.isEmpty() == false)
                str[E_DATA_NICK] = strNickArea;

            if (str[E_DATA_NICK].isEmpty())
            {
                // Fallback: read the nickname from the inline script text
                //   var nickName = '<value>'
                const QString strHtml = frame->toHtml();
                const QString strFind = "var nickName = '";
                const int start = strHtml.indexOf(strFind);
                const int valueStart = start + strFind.length();
                const int end = (start == -1) ? -1 : strHtml.indexOf("'", valueStart);

                // Marker missing, closing quote missing, or empty value:
                // nothing usable, and nothing may be indexed.
                if (start == -1 || end <= valueStart)
                    cout << "error : nick name can not find and next again connect." << endl;
                else
                    str[E_DATA_NICK] = strHtml.mid(valueStart, end - valueStart);
            }

            if (strUrlId == str[E_DATA_NICK])
            {
                str[E_DATA_ID] = str[E_DATA_NICK];
            }
            else
            {
                const QString strIdText = Find(profile,"span","class","itemfont col").toPlainText();
                if (strIdText.isEmpty() == false)
                {
                    str[E_DATA_ID] = strIdText;
                    str[E_DATA_ID] = str[E_DATA_ID].replace("(","").replace(")","");
                }
                if (str[E_DATA_ID].isEmpty())
                    str[E_DATA_ID] = strUrlId;
            }
        }

        // --- title, date, body -----------------------------------------
        {
            QWebElement post = Find(frame->documentElement(),"div","id","postListBody");
            QWebElement post_top = Find(post,"table","class","post-top");

            const QString strTitle = Find(post_top,"div","class","htitle").toPlainText();
            if (strTitle.isEmpty() == false)
                str[E_DATA_TITLE] = GetSafeUtf(strTitle);

            QWebElement date = Find(post_top,"p","class","date fil5 pcol2 _postAddDate");
            str[E_DATA_DATE] = date.toPlainText().trimmed().replace("/","-");
            if (str[E_DATA_DATE].isEmpty() == false)
            {
                // ":00" is appended to the scraped text, presumably as the
                // seconds field -- confirm against the page's date format.
                str[E_DATA_DATE] += ":00";
                cout << "str[E_DATA_DATE] = " << str[E_DATA_DATE].toStdString() << endl;
            }
            else
            {
                UpdateError("Error code 4");
                m_bUse = false;
            }

            const QString strBody = Find(post,"div","class","post-view pcol2 _param(1)").toPlainText();
            if (strBody.isEmpty() == false)
                str[E_DATA_DATA] = GetSafeUtf(strBody);
        }

        // --- store -----------------------------------------------------
        QSqlQuery sql;
        sql.prepare(QString("update " + m_strTable +
                            " set article_nickname=:NICK,article_id=:ID,article_title=:TITLE,"
                            "article_date=:DATE,article_data=:DATA"
                            " where article_url=:URL"));
        sql.bindValue(":NICK", str[E_DATA_NICK].trimmed().toUtf8());
        sql.bindValue(":ID", str[E_DATA_ID].trimmed().toUtf8());
        sql.bindValue(":TITLE", str[E_DATA_TITLE].trimmed().toUtf8());
        sql.bindValue(":DATE", str[E_DATA_DATE].trimmed().toUtf8());
        sql.bindValue(":DATA", str[E_DATA_DATA].trimmed().toUtf8());
        sql.bindValue(":URL", m_strUrl.toUtf8());
        if (sql.exec() == false)
        {
            cout << "error : " << sql.lastError().text().toStdString();
            UpdateError("Error code 5");
            m_bUse = false;
        }
    }

    foreach (QWebFrame *childFrame, frame->childFrames())
        saveFrameUrl(childFrame);
}
|
|
|
|
void SCrawler::saveFrameComment(QWebFrame *frame)
|
|
{
|
|
QWebElement group = Find(frame->documentElement(),"ul","id","commentList");
|
|
QWebElementCollection elements = group.findAll("li");
|
|
QString strParent,strDate,strNick,strComm,strUrl;
|
|
QStringList strList = m_strUrl.split("/");
|
|
for (int i=0; i < 5; i++)
|
|
strUrl += strList.at(i) + "/";
|
|
|
|
strUrl = strUrl.left(strUrl.size()-1);
|
|
int nCount=0;
|
|
foreach (QWebElement element, elements)
|
|
{
|
|
if (element.attribute("class") == "_countableComment ")
|
|
{
|
|
strNick = strParent = Find(element,"a","class","nick pcol2").toPlainText();
|
|
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
|
|
strComm = Find(element,"dd","class","comm pcol2").toPlainText();
|
|
//strComm = GetSafeUtf(strComm);
|
|
if (strComm.isEmpty()== false)
|
|
{
|
|
strComm.replace("'","\\'");
|
|
strComm.replace("\"","\\\"");
|
|
strComm = strComm.trimmed();
|
|
//cout << strComm.toStdString() << endl;
|
|
QSqlQuery query;
|
|
query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
|
|
cout << "m_strTable = " << m_strTable.toStdString() << endl;
|
|
cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl;
|
|
cout << "data = " << strComm.toStdString() << endl;
|
|
cout << "date = " << strDate.toStdString() << endl;
|
|
cout << "urlReply = " << m_strUrl.toStdString() << endl;
|
|
cout << "ronum = " << nCount << endl;
|
|
query.bindValue(":URL", strUrl.toUtf8());
|
|
query.bindValue(":NICK",strNick.toUtf8());
|
|
query.bindValue(":DATA",strComm.toUtf8());
|
|
query.bindValue(":DATE",strDate.toUtf8());
|
|
query.bindValue(":PARENT",QString("").toUtf8());
|
|
query.bindValue(":URLREPLY",m_strUrl.toUtf8());
|
|
query.bindValue(":ROWNUM",(nCount++));
|
|
|
|
if (query.exec()==false)
|
|
cout << "error : " << query.lastError().text().toStdString();
|
|
}
|
|
}
|
|
if (element.attribute("class") == "reply _countableComment ")
|
|
{
|
|
strNick = Find(element,"a","class","nick pcol2").toPlainText();
|
|
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
|
|
QWebElement subElement = Find(element,"dd","class","comm pcol2");
|
|
QString subNick = Find(subElement,"a","class","nick pcol2").toPlainText();
|
|
strComm = subElement.toPlainText();
|
|
if(subNick.isEmpty() == false)
|
|
{
|
|
strNick = strParent;
|
|
strComm = strComm.right(strComm.size()-subNick.size()-1);
|
|
}
|
|
|
|
if (strComm.isEmpty() == false)
|
|
{
|
|
// strComm = GetSafeUtf(strComm);
|
|
strComm.replace("'","\\'");
|
|
strComm.replace("\"","\\\"");
|
|
strComm = strComm.trimmed();
|
|
QSqlQuery query;
|
|
cout << "m_strTable = " << m_strTable.toStdString() << endl;
|
|
cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl;
|
|
cout << "data = " << strComm.toStdString() << endl;
|
|
cout << "date = " << strDate.toStdString() << endl;
|
|
cout << "urlReply = " << m_strUrl.toStdString() << endl;
|
|
cout << "ronum = " << nCount << endl;
|
|
cout << "parent = " << strParent.toStdString() << endl;
|
|
query.prepare(QString("insert into " + m_strTable + " (article_url,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES (:URL,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
|
|
query.bindValue(":URL",strUrl.toUtf8());
|
|
query.bindValue(":NICK",strNick.toUtf8());
|
|
query.bindValue(":DATA",strComm.toUtf8());
|
|
query.bindValue(":DATE",strDate.toUtf8());
|
|
query.bindValue(":PARENT",strParent.toUtf8());
|
|
query.bindValue(":URLREPLY",m_strUrl.toUtf8());
|
|
query.bindValue(":ROWNUM",(nCount++));
|
|
if (query.exec()==false)
|
|
{
|
|
cout << "error : " << query.lastError().text().toStdString();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Parses one page of cafe search results and inserts a row into m_strTable
/// for every "sh_cafe_top" item whose link points at cafe.naver.com.
/// Afterwards it reads the "title_num" counter and the last query parameter
/// of m_strUrl to decide whether this was the final page (m_bLast).
///
/// Side effects: sets m_bUse once an item has been seen, m_bError when the
/// counter is missing or malformed, and appends the frame HTML to
/// "<frame name><n>.html" through Debug().
void SCrawler::saveFrameCafeList(QWebFrame *frame)
{
    if (m_bUse == true) return;

    // NOTE(review): dumps every processed frame to disk; this looks like
    // leftover diagnostics -- confirm it is still wanted.
    static int cz = 0;
    Debug(frame->frameName() + QString::number(cz++) + ".html", frame->toHtml());

    QWebElement eleMain = Find(frame->documentElement(),"div","class","cafe_article section _cafeBase");

    foreach (QWebElement eleSub, eleMain.findAll("li"))
    {
        if (eleSub.attribute("class") != "sh_cafe_top")
            continue;

        QString strUrl, strTitle;
        foreach (QWebElement eleSubUrl, eleSub.findAll("a"))
        {
            const QString strClass = eleSubUrl.attribute("class");
            if (strClass == "url")
                strUrl = eleSubUrl.attribute("href");
            if (strClass == "sh_cafe_title")
                // GetSafeUtf() drops quotes and backslashes, and the value
                // is bound below, so no SQL escaping is applied here.
                strTitle = GetSafeUtf(eleSubUrl.toPlainText().trimmed());
        }

        // "scheme://host/<cafe id>/..." -> pieces 2 and 3. value() returns
        // an empty string when the href is missing or shorter than that.
        const QStringList urlParts = strUrl.split("/");
        if (urlParts.value(2) == "cafe.naver.com")
        {
            QSqlQuery sql;
            sql.prepare(QString("insert into " + m_strTable +
                                " (platform_name,platform_form,article_form,article_url,platform_id,article_title,keyword_id)"
                                " VALUES ('naver','cafe','body',:URL,:PID,:TITLE,:KEYWORD)"));
            sql.bindValue(":URL", strUrl.toUtf8());
            sql.bindValue(":PID", urlParts.value(3).toUtf8());
            sql.bindValue(":TITLE", strTitle.toUtf8());
            sql.bindValue(":KEYWORD", m_strKeywordID.toUtf8());
            if (sql.exec() == false)
                cout << "x " << sql.lastError().text().toStdString();
            else
                cout << "o " << strUrl.toStdString() << endl;
        }
        m_bUse = true;
    }

    // Pagination: the counter text is split at '/' and the part after it is
    // taken as the total; the value of the last URL parameter is taken as the
    // current offset.
    const QString strTotal = Find(eleMain,"span","class","title_num").toPlainText();
    if (strTotal.isEmpty()) { m_bError = true; return; }

    const QStringList totalParts = strTotal.split("/");
    const QStringList lastParam = m_strUrl.split("&").last().split("=");
    if (totalParts.size() < 2 || lastParam.size() < 2)
    {
        // Unexpected layout: report it like a missing counter instead of
        // indexing out of range.
        m_bError = true;
        return;
    }

    const int nTotal = GetNumber(totalParts.at(1));
    const int nNow = GetNumber(lastParam.at(1));
    if ((nNow + 10) > nTotal || nNow >= 1000)
        m_bLast = true;
}
|
|
|
|
/// Extracts cafe article data from @p frame and, recursively, from its child
/// frames, until the "cafe_main" frame has been handled (m_bUse set).
///
///  - Any frame containing an <h1 class="d-none">: its text is stored as
///    platform_title of the row whose article_url equals m_strUrl.
///  - Frame "cafe_main": updates body, date, nickname, id and hit count of
///    that row, then inserts one row per comment from the "cmt_list" list.
///
/// All statements use bound parameters, so scraped text never becomes part
/// of the SQL text.
void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
{
    if (m_bUse) return;

    const QWebElement root = frame->documentElement();

    // --- platform title ----------------------------------------------
    const QString strPlatformTitle = Find(root,"h1","class","d-none").toPlainText();
    if (strPlatformTitle.isEmpty() == false)
    {
        QSqlQuery sql;
        sql.prepare(QString("update " + m_strTable +
                            " set platform_title = :TITLE where article_url = :URL"));
        sql.bindValue(":TITLE", GetSafeUtf(strPlatformTitle).toUtf8());
        sql.bindValue(":URL", m_strUrl.toUtf8());
        if (sql.exec() == false)
            cout << "error : " << sql.lastError().text().toStdString();
    }

    if (frame->frameName() == "cafe_main")
    {
        // --- article -------------------------------------------------
        {
            QString strDate, strNick, strID, strHits;

            const QString strData = GetSafeUtf(Find(root,"div","class","tbody m-tcol-c").toPlainText().trimmed());

            strDate = Find(root,"td","class","m-tcol-c date").toPlainText().trimmed().replace(".","-");
            if (strDate.isEmpty())
            {
                // Alternative layout: the element holds no time part, so a
                // midnight time is appended.
                strDate = Find(root,"em","class","date m-tcol-c").toPlainText().trimmed().replace(".","-");
                strDate += " 00:00:00";
            }
            else
                strDate += ":00";

            // The writer link's onclick attribute is split at ','; piece 1
            // is used as the id and piece 3 as a fallback nickname.
            const QStringList onclickArgs =
                Find(Find(root,"td","class","m-tcol-c b nick"),"a","class","m-tcol-c b")
                    .attribute("onclick").trimmed().split(",");

            strNick = Find(Find(root,"div","class","etc-box"),"td","class","p-nick").toPlainText().trimmed();
            if (strNick.isEmpty() == false)
            {
                // Keep only the text in front of the first '('.
                strNick = strNick.split("(").at(0);
                if (onclickArgs.size() >= 2)
                    strID = onclickArgs.at(1).trimmed().replace("'","");
            }
            else if (onclickArgs.size() >= 4)
            {
                strID = onclickArgs.at(1).trimmed().replace("'","");
                strNick = onclickArgs.at(3).trimmed().replace("'","");
            }

            strHits = Find(root,"span","class","b m-tcol-c reply _rosReadcount").toPlainText();
            if (strHits.isEmpty())
                strHits = Find(root,"span","class","kin_count m-tcol-c _rosReadcount").toPlainText();

            QSqlQuery sql;
            sql.prepare(QString("update " + m_strTable +
                                " set article_data=:DATA,article_date=:DATE,article_nickname=:NICK,"
                                "article_id=:ID,article_hit=:HIT"
                                " where article_url=:URL"));
            sql.bindValue(":DATA", strData.toUtf8());
            sql.bindValue(":DATE", strDate.toUtf8());
            sql.bindValue(":NICK", strNick.toUtf8());
            sql.bindValue(":ID", strID.toUtf8());
            sql.bindValue(":HIT", strHits.toUtf8());
            sql.bindValue(":URL", m_strUrl.toUtf8());
            if (sql.exec() == false)
                cout << "error : " << sql.lastError().text().toStdString();
        }

        // --- comments ------------------------------------------------
        {
            QWebElementCollection elements = Find(root,"ul","id","cmt_list").findAll("li");
            QString strParent; // nickname of the latest top-level comment
            int nCount = 0;    // article_order of the next inserted row

            foreach (QWebElement element, elements)
            {
                const QString strClass = element.attribute("class");

                if (strClass.isEmpty())
                {
                    // Top-level comment.
                    const QString strRaw = Find(element,"span","class","comm_body").toPlainText().trimmed();
                    if (strRaw.isEmpty()) continue;
                    const QString strData = GetSafeUtf(strRaw);

                    const QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
                    const QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed();
                    strParent = strNick;
                    const QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed();
                    if (strDate.isEmpty()) continue;

                    QSqlQuery query;
                    query.prepare(QString("insert into " + m_strTable +
                                          " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,reply_url,article_order)"
                                          " VALUES ('naver','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)"));
                    query.bindValue(":URL", m_strUrl.toUtf8());
                    query.bindValue(":ID", strID.toUtf8());
                    query.bindValue(":NICK", strNick.toUtf8());
                    query.bindValue(":DATA", strData.toUtf8());
                    query.bindValue(":DATE", strDate.toUtf8());
                    query.bindValue(":URLREPLY", m_strReper.toUtf8());
                    query.bindValue(":ROWNUM", nCount++);
                    if (query.exec() == false)
                        cout << "error : " << query.lastError().text().toStdString();
                }

                if (strClass == "reply")
                {
                    // Reply to a comment.
                    // NOTE(review): unlike top-level comments, the reply
                    // text is stored without GetSafeUtf() filtering --
                    // confirm whether that difference is intended.
                    const QString strData = Find(element,"span","class","comm_body").toPlainText().trimmed();
                    if (strData.isEmpty()) continue;

                    const QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
                    const QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed();
                    const QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed();

                    // Prefer the explicitly named parent, else the latest
                    // top-level commenter.
                    QString strReParent = strParent;
                    const QString strNamedParent = Find(element,"span","class","re-p-nick").toPlainText();
                    if (strNamedParent.isEmpty() == false)
                        strReParent = strNamedParent;

                    QSqlQuery query;
                    query.prepare(QString("insert into " + m_strTable +
                                          " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order)"
                                          " VALUES ('naver','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)"));
                    query.bindValue(":URL", m_strUrl.toUtf8());
                    query.bindValue(":ID", strID.toUtf8());
                    query.bindValue(":NICK", strNick.toUtf8());
                    query.bindValue(":DATA", strData.toUtf8());
                    query.bindValue(":DATE", strDate.toUtf8());
                    query.bindValue(":PARENT", strReParent.toUtf8());
                    query.bindValue(":URLREPLY", m_strReper.toUtf8());
                    query.bindValue(":ROWNUM", nCount++);
                    if (query.exec() == false)
                        cout << "error : " << query.lastError().text().toStdString();
                }
            }
        }

        // Marks the article as handled; this also stops the recursion below
        // from doing any further work.
        m_bUse = true;
    }

    foreach (QWebFrame *childFrame, frame->childFrames())
        saveFrameCafeUrl(childFrame);
}
|
|
|
|
/// Searches the descendants of @p _FindElement selected by @p _strElement
/// and returns the first one whose attribute @p _strAttrib is exactly equal
/// to @p _strFind.
/// @return the matching element, or a default-constructed QWebElement when
///         nothing matches.
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int nCandidates = candidates.count();

    for (int idx = 0; idx < nCandidates; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        if (candidate.attribute(_strAttrib) == _strFind)
            return candidate;
    }
    return QWebElement();
}
|