2485 lines
94 KiB
C++
2485 lines
94 KiB
C++
#include "scrawler.h"
|
||
#include <iostream>
|
||
#include <QSqlQuery>
|
||
#include <QSqlError>
|
||
#include <QByteArray>
|
||
#include <QDebug>
|
||
#include <QTimer>
|
||
#include <QThread>
|
||
#include <QRegExp>
|
||
#include <ctime>
|
||
|
||
class SWebPage:public QWebPage
|
||
{
|
||
public:
|
||
SWebPage(QObject * parent = 0): QWebPage(parent){}
|
||
protected:
|
||
void javaScriptAlert(QWebFrame * frame, const QString & msg){
|
||
std::cout << "deletedurl";
|
||
exit(1);
|
||
}
|
||
//bool javaScriptConfirm(QWebFrame * frame, const QString & msg){}
|
||
};
|
||
|
||
using namespace std;
|
||
|
||
// Upper bound on retry attempts; compared against m_nRetryCount before
// another retry is scheduled.
const int RETRY_MAX = 4;

// Delay handed to QTimer::singleShot() before a retry, in milliseconds.
const int RETRY_INTERVAL = 3000;
|
||
|
||
struct SProxyList
|
||
{
|
||
QString m_strAddress;
|
||
int m_nPort;
|
||
};
|
||
|
||
/**
 * Creates the crawler together with its web page and wires the page's
 * loadFinished(bool) signal to saveResult(bool).
 */
SCrawler::SCrawler():QObject()
{
    // Parent the page to this object: the destructor is empty, so without a
    // parent the page was never deleted. Qt's parent/child ownership now
    // releases it together with the crawler.
    m_page = new SWebPage(this);

    m_nRetryCount = 0;
    m_bProcessed = false;

    // Give the status flags a defined value even before load() is called.
    m_bUse = false;
    m_bNothing = false;
    m_bLast = false;
    m_bError = false;

    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));

    // Seed the C random number generator from the wall clock.
    srand(time(NULL));
}
|
||
|
||
// Destructor: performs no explicit clean-up.
SCrawler::~SCrawler() {}
|
||
|
||
/**
 * Configures the crawler from the argument list and starts loading the page.
 *
 * Argument layout as read by this function:
 *   [0] platform  ("naver" or "daum")
 *   [1] command   (news_list, news_data, news_comm, cafe_list, cafe_data,
 *                  blog_list, blog_url, blog_comm)
 *   [2] URL to load
 *   [3] table suffix; the table name becomes "data_" + suffix
 *   [4] keyword id, or the referer for cafe_data
 *   [5] keyword id for cafe_data
 *
 * Missing arguments are read as empty strings instead of indexing past the
 * end of the list. The result is reported asynchronously through the page's
 * loadFinished(bool) signal.
 */
void SCrawler::load(QStringList _strlistArgv)
{
    m_bUse = false;
    m_bNothing = false;

    // QList::value() returns a default-constructed (empty) QString for an
    // out-of-range index, so a short argument list can no longer crash here.
    const QString strPlatform = _strlistArgv.value(0);
    const QString strCommand = _strlistArgv.value(1);
    const QString strTarget = _strlistArgv.value(2);

    if (strPlatform == "naver")
    {
        if (strCommand == "news_list")
        {
            m_strUrl = strTarget;
            m_nSelect = E_NAVER_NEWS_LIST;
            setProxy();
        }
        else if (strCommand == "cafe_list")
        {
            m_strUrl = strTarget;
            m_nSelect = E_NAVER_CAFE_LIST;
            m_strKeywordID = _strlistArgv.value(4);
            setProxy();
        }
        else if (strCommand == "cafe_data")
        {
            m_strUrl = strTarget;
            m_nSelect = E_NAVER_CAFE_DATA;
            m_strReper = _strlistArgv.value(4);
            m_strKeywordID = _strlistArgv.value(5);
        }
        else if (strCommand == "blog_list")
        {
            m_strUrl = strTarget;
            m_nSelect = E_NAVER_BLOG_LIST;
            m_strKeywordID = _strlistArgv.value(4);
            setProxy();
        }
        else if (strCommand == "blog_url")
        {
            m_strUrl = strTarget;
            m_nSelect = E_NAVER_BLOG_BODY;
            m_strKeywordID = _strlistArgv.value(4);
            m_bUse = true;
        }
        else if (strCommand == "blog_comm")
        {
            m_strUrl = strTarget;
            m_nSelect = E_NAVER_BLOG_REPLY;
        }
        else if (strCommand == "news_data")
        {
            m_strUrl = strTarget;
            m_nSelect = E_NAVER_NEWS_DATA;
            m_strKeywordID = _strlistArgv.value(4);
        }
        else if (strCommand == "news_comm")
        {
            m_strUrl = strTarget;
            m_nSelect = E_NAVER_NEWS_REPLY;
        }

        if (_strlistArgv.size() > 3)
            m_strTable = "data_" + _strlistArgv.at(3);
    }

    if (strPlatform == "daum")
    {
        if (strCommand == "cafe_list")
        {
            m_strUrl = strTarget;
            m_nSelect = E_DAUM_CAFE_LIST;
            m_strKeywordID = _strlistArgv.value(4);
            setProxy();
        }
        else if (strCommand == "cafe_data")
        {
            m_strUrl = strTarget;
            m_nSelect = E_DAUM_CAFE_DATA;
            m_strReper = _strlistArgv.value(4);
            m_strKeywordID = _strlistArgv.value(5);
        }
        else if (strCommand == "blog_list")
        {
            m_strUrl = strTarget;
            m_nSelect = E_DAUM_BLOG_LIST;
            m_strKeywordID = _strlistArgv.value(4);
            setProxy();
        }
        else if (strCommand == "blog_url")
        {
            m_strUrl = strTarget;
            m_nSelect = E_DAUM_BLOG_BODY;
            m_bUse = true;
        }
        else if (strCommand == "blog_comm")
        {
            m_strUrl = strTarget;
            m_nSelect = E_DAUM_BLOG_REPLY;
        }

        // Same guard as the naver branch; the table suffix used to be read
        // unconditionally here.
        if (_strlistArgv.size() > 3)
            m_strTable = "data_" + _strlistArgv.at(3);
    }

    cout << m_strUrl.toStdString() << endl;

    QUrl url = QUrl(m_strUrl);
    if (url.scheme().isEmpty())
        url.setScheme("http");

    m_page->settings()->setAttribute(QWebSettings::AutoLoadImages, false);

    // The request is passed to QWebFrame::load() by reference, so a stack
    // object is sufficient; the previous heap allocation was never released.
    QNetworkRequest request;
    request.setUrl(url);

    // The Referer header is only sent for Naver cafe article pages.
    if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
        request.setRawHeader("Referer", m_strReper.toLocal8Bit());
    request.setRawHeader("Accept-Language", "ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");

    m_page->mainFrame()->load(request);

    m_bLast = false;
    m_bError = false;
}
|
||
|
||
/**
 * Marks the current crawl as having reported a status.
 *
 * The only effect is m_bError = true; the text in _strError is not used.
 * A disabled earlier version wrote the text into the ERROR column of
 * m_strTableBody for the row whose URL equals m_strUrl.
 */
void SCrawler::UpdateError(QString _strError)
{
    Q_UNUSED(_strError);
    m_bError = true;
}
|
||
|
||
void SCrawler::saveResult(bool ok)
|
||
{
|
||
// qDebug() << "saveResult";
|
||
|
||
if (!ok)
|
||
{
|
||
cout << "Failed loading";
|
||
deleteProxy();
|
||
emit finished();
|
||
return;
|
||
}
|
||
|
||
//qDebug() << "load complete";
|
||
switch(m_nSelect)
|
||
{
|
||
case E_NAVER_NEWS_LIST:saveFrameNewsList(m_page->mainFrame());break;
|
||
case E_NAVER_NEWS_DATA:
|
||
{
|
||
static bool loaded = false;
|
||
if(!loaded)
|
||
{
|
||
loaded = true;
|
||
if(!saveFrameNewsUrl(m_page->mainFrame()))
|
||
{
|
||
loaded = false;
|
||
return;
|
||
}
|
||
bodydata.sendDB();
|
||
}
|
||
loaded = true;
|
||
break;
|
||
}
|
||
case E_NAVER_NEWS_REPLY:
|
||
{
|
||
if(!saveFrameNewsComment(m_page->mainFrame()))
|
||
return;
|
||
break;
|
||
}
|
||
case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break;
|
||
case E_NAVER_CAFE_DATA:
|
||
{
|
||
saveFrameCafeUrl(m_page->mainFrame());
|
||
bodydata.sendDB();
|
||
break;
|
||
}
|
||
case E_NAVER_BLOG_LIST:
|
||
{
|
||
if(saveFrameList(m_page->mainFrame()))
|
||
break;
|
||
else
|
||
return;
|
||
}
|
||
case E_NAVER_BLOG_BODY:
|
||
{
|
||
if(!saveFrameUrl(m_page->mainFrame()))
|
||
return;
|
||
bodydata.sendDB();
|
||
break;
|
||
}
|
||
case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break;
|
||
case E_DAUM_CAFE_LIST:saveFrameDaumCafeList(m_page->mainFrame());break;
|
||
case E_DAUM_CAFE_DATA:
|
||
{
|
||
saveFrameDaumCafeUrl(m_page->mainFrame());
|
||
bodydata.sendDB();
|
||
break;
|
||
}
|
||
case E_DAUM_BLOG_LIST:saveFrameDaumBlogList(m_page->mainFrame());break;
|
||
case E_DAUM_BLOG_BODY:
|
||
{
|
||
saveFrameDaumBlogUrl(m_page->mainFrame());
|
||
bodydata.sendDB();
|
||
break;
|
||
}
|
||
case E_DAUM_BLOG_REPLY:saveFrameDaumBlogComment(m_page->mainFrame());break;
|
||
}
|
||
|
||
switch(m_nSelect)
|
||
{
|
||
case E_NAVER_CAFE_LIST:
|
||
case E_NAVER_BLOG_LIST:
|
||
case E_DAUM_CAFE_LIST:
|
||
case E_DAUM_BLOG_LIST:
|
||
case E_NAVER_NEWS_LIST:
|
||
if (m_bError)
|
||
{
|
||
cout << "block";// block
|
||
deleteProxy();
|
||
break;
|
||
}
|
||
if (m_bNothing == false)
|
||
{
|
||
cout << "nothing";
|
||
m_bNothing = true;
|
||
}
|
||
if (m_bLast)
|
||
{
|
||
cout << "last";
|
||
m_bLast = false;
|
||
}
|
||
|
||
break;
|
||
case E_NAVER_BLOG_REPLY:
|
||
case E_NAVER_NEWS_REPLY:
|
||
case E_DAUM_BLOG_REPLY:
|
||
if (m_bUse)
|
||
{
|
||
cout << "ok";
|
||
m_bUse = false;
|
||
}
|
||
break;
|
||
case E_NAVER_CAFE_DATA:
|
||
case E_NAVER_BLOG_BODY:
|
||
case E_DAUM_CAFE_DATA:
|
||
case E_DAUM_BLOG_BODY:
|
||
case E_NAVER_NEWS_DATA:
|
||
if (m_bUse == false)
|
||
{
|
||
cout << "fail";
|
||
UpdateError("Error code 0");
|
||
}
|
||
else
|
||
{
|
||
if (m_bError == false)
|
||
{
|
||
cout << "ok";
|
||
UpdateError("ok");
|
||
}
|
||
}
|
||
break;
|
||
}
|
||
emit finished();
|
||
|
||
}
|
||
|
||
/**
 * Concatenates every numeric character of _str and converts the result to
 * an int.
 *
 * @return the parsed value, or 0 when the conversion fails (for example when
 *         _str contains no numeric characters).
 */
int SCrawler::GetNumber(QString _str)
{
    // Same extraction as the two-argument overload; the success flag is
    // simply discarded (QString::toInt() yields 0 on failure either way).
    bool bIgnored = false;
    return GetNumber(_str, bIgnored);
}
|
||
|
||
|
||
/**
 * Concatenates every numeric character of _str and converts the result to
 * an int.
 *
 * @param _str  text to scan.
 * @param ok    receives the success flag of QString::toInt().
 * @return the parsed value, or 0 when the conversion fails.
 */
int SCrawler::GetNumber(QString _str, bool &ok)
{
    QString strDigits;
    foreach (const QChar &ch, _str)
    {
        if (!ch.isNumber())
            continue;
        strDigits.append(ch);
    }
    return strDigits.toInt(&ok);
}
|
||
|
||
|
||
/**
 * Appends _strData to the text file _strFilename.
 * Nothing is written when the file cannot be opened.
 */
void SCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile logFile(_strFilename);
    const QIODevice::OpenMode openMode =
        QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append;

    if (logFile.open(openMode))
    {
        QTextStream writer(&logFile);
        writer << _strData;
        logFile.close();
    }
}
|
||
|
||
/**
 * Escapes a string for embedding in a quoted SQL string literal that uses
 * backslash escapes.
 *
 * Backslashes are escaped first. Previously they were left alone, so an
 * input such as  \'  became  \\'  - an escaped backslash followed by a bare
 * quote that terminates the literal.
 *
 * @return _str with  \  '  "  each prefixed by a backslash.
 */
QString SCrawler::SqlString(QString _str)
{
    // Order matters: escaping the backslash last would double the
    // backslashes introduced for the quotes.
    _str.replace("\\", "\\\\");
    _str.replace("'", "\\'");
    _str.replace("\"", "\\\"");
    return _str;
}
|
||
|
||
|
||
/**
 * Returns a copy of _strData reduced to a whitelist of characters.
 *
 * Kept are:
 *   - code units 12593..12622 (U+3131..U+314E),
 *   - code units 44032..55203 (U+AC00..U+D7A3, the Hangul syllable range),
 *   - anything QChar classifies as digit, number, whitespace, lower-case
 *     letter, upper-case letter or symbol.
 * Every other character is dropped.
 */
QString SCrawler::GetSafeUtf(QString _strData)
{
    QString strFiltered;
    const int nLength = _strData.length();

    for (int idx = 0; idx < nLength; ++idx)
    {
        const QChar ch = _strData.at(idx);
        const ushort code = ch.unicode();

        const bool bInFirstRange = (code >= 12593 && code <= 12622);
        const bool bInSecondRange = (code >= 44032 && code <= 55203);
        const bool bBasicClass = ch.isDigit() || ch.isNumber() || ch.isSpace()
                              || ch.isLower() || ch.isUpper() || ch.isSymbol();

        // The three tests are applied independently, as three separate appends.
        if (bInFirstRange)
            strFiltered += ch;
        if (bInSecondRange)
            strFiltered += ch;
        if (bBasicClass)
            strFiltered += ch;
    }
    return strFiltered;
}
|
||
|
||
void SCrawler::reloadListPage()
|
||
{
|
||
++m_nRetryCount;
|
||
if (m_nRetryCount >= RETRY_MAX)
|
||
{
|
||
cout << "block";
|
||
emit finished();
|
||
return;
|
||
}
|
||
m_bProcessed = false;
|
||
saveResult(true);
|
||
}
|
||
|
||
|
||
bool SCrawler::saveFrameList(QWebFrame *frame)
|
||
{
|
||
|
||
if (m_bProcessed == false)
|
||
m_bProcessed = true;
|
||
else
|
||
return false;
|
||
|
||
//qDebug() << frame->documentElement().toPlainText();
|
||
|
||
if (m_bUse == true) return true;
|
||
QWebElement notFound = Find(frame->documentElement(),"div","id","notfound");
|
||
if(notFound.isNull() == false)
|
||
{
|
||
m_bLast = true;
|
||
return true;
|
||
}
|
||
|
||
QWebElement naverBlock = Find(frame->documentElement(), "p", "class", "info01");
|
||
if (!naverBlock.isNull())
|
||
{
|
||
m_bError = true;
|
||
cout << "naver";
|
||
return true;
|
||
}
|
||
|
||
QWebElement eleMain = Find(frame->documentElement(),"div","class","blog section _blogBase");
|
||
QStringList urlList;
|
||
|
||
if (eleMain.isNull())
|
||
{
|
||
QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadListPage()));
|
||
return false;
|
||
}
|
||
|
||
for (int i = 0; i < 10 ; i++)
|
||
{
|
||
QString str = "sp_blog_";
|
||
QWebElement sub = Find(eleMain,"li","id",str+QString::number(i+1));
|
||
QString strUrl = Find(sub,"a","class","url").attribute("href");
|
||
if (strUrl.isEmpty())
|
||
{
|
||
//m_bLast = true;
|
||
//m_bUse = true;
|
||
break;
|
||
}
|
||
|
||
strUrl = strUrl.replace("http://","");
|
||
strUrl = strUrl.replace("?Redirect=Log&logNo=","/", Qt::CaseInsensitive);
|
||
|
||
QStringList strList = strUrl.split('/');
|
||
|
||
QString strBlogMe = "blog.me";
|
||
|
||
if ((strList.at(0).compare("blog.naver.com") != 0 ) && (strList.at(0).right(strBlogMe.length()).compare(strBlogMe) != 0))
|
||
{
|
||
continue;
|
||
}
|
||
|
||
if((strList.at(0).right(strBlogMe.length()).compare(strBlogMe) == 0))
|
||
{
|
||
QStringList strSubList = strList.at(0).split('.');
|
||
strUrl = "blog.naver.com/";
|
||
strUrl += strSubList.at(0);
|
||
strUrl += "/";
|
||
strUrl += strList.at(1);
|
||
}
|
||
|
||
urlList << QString("http://%1").arg(strUrl);
|
||
|
||
}
|
||
|
||
|
||
if(urlList.size() > 0)
|
||
{
|
||
QString strUrlList;
|
||
strUrlList = "(";
|
||
foreach(QString str, urlList)
|
||
{
|
||
strUrlList += "'";
|
||
strUrlList += str;
|
||
strUrlList += "',";
|
||
}
|
||
strUrlList = strUrlList.left(strUrlList.size() - 1);
|
||
strUrlList += ")";
|
||
|
||
QSqlQuery sql;
|
||
|
||
QString strQuery = "delete from ";
|
||
strQuery += m_strTable;
|
||
strQuery += QString(" where article_url in %1").arg(strUrlList);
|
||
//qDebug() << strQuery;
|
||
|
||
if (sql.exec(strQuery.toUtf8()) == false)
|
||
{
|
||
cout << "error " << sql.lastError().text().toStdString();
|
||
cout << strQuery.toStdString();
|
||
}
|
||
}
|
||
|
||
for (int i = 0; i < 10 ; i++)
|
||
{
|
||
QString str = "sp_blog_";
|
||
QWebElement sub = Find(eleMain,"li","id",str+QString::number(i+1));
|
||
QString strUrl = Find(sub,"a","class","url").attribute("href");
|
||
if (strUrl.isEmpty())
|
||
{
|
||
|
||
//m_bLast = true;
|
||
m_bUse = true;
|
||
break;
|
||
}
|
||
strUrl = strUrl.replace("http://","");
|
||
strUrl = strUrl.replace("?Redirect=Log&logNo=","/", Qt::CaseInsensitive);
|
||
QStringList strList = strUrl.split('/');
|
||
|
||
QString strBlogMe = "blog.me";
|
||
|
||
if ((strList.at(0).compare("blog.naver.com") != 0 ) && (strList.at(0).right(strBlogMe.length()).compare(strBlogMe) != 0))
|
||
{
|
||
cout << "x http://" << strUrl.toStdString() <<endl; continue;
|
||
}
|
||
|
||
if((strList.at(0).right(strBlogMe.length()).compare(strBlogMe) == 0))
|
||
{
|
||
QStringList strSubList = strList.at(0).split('.');
|
||
strUrl = "blog.naver.com/";
|
||
strUrl += strSubList.at(0);
|
||
strUrl += "/";
|
||
strUrl += strList.at(1);
|
||
}
|
||
|
||
/*
|
||
QString strQuery = "select article_url from ";
|
||
strQuery += m_strTable;
|
||
strQuery += QString(" where article_url = 'http://%1'").arg(strUrl);
|
||
sql.exec(strQuery);
|
||
*/
|
||
|
||
//if (sql.size() == 0 || sql.size() == -1)
|
||
{
|
||
QString str = Find(sub,"a","class","txt84").toPlainText();
|
||
str = GetSafeUtf(str);
|
||
str.replace("'","\\'");
|
||
str.replace("\"","\\\"");
|
||
str = str.trimmed();
|
||
QString strPlatformId;
|
||
|
||
if(strUrl.split("/").at(0).compare("blog.naver.com") == 0)
|
||
strPlatformId = strUrl.split("/").at(1);
|
||
else
|
||
strPlatformId = strUrl.split("/").at(0).split(".").at(0);
|
||
|
||
/*
|
||
QString strQuery = QString("insert into ");
|
||
strQuery += m_strTable;
|
||
strQuery += QString(" set platform_name='naver',platform_form='blog',article_form='body',article_url='%1',platform_id='%2',platform_title='%3',keyword_id='%4'").arg("http://"+strUrl).arg(strPlatformId).arg(str).arg(m_strKeywordID);
|
||
QString strUtf8(strQuery.toUtf8());
|
||
if (sql.exec(strUtf8) == false)
|
||
cout << "error : " << sql.lastError().text().toStdString();
|
||
else
|
||
*/
|
||
cout << "o ";
|
||
}
|
||
//else
|
||
// cout << "v ";
|
||
cout << "http://" << strUrl.toStdString() << endl;
|
||
m_bUse = true;
|
||
}
|
||
|
||
{
|
||
QWebElement total = Find(eleMain,"span","class","title_num");
|
||
if (total.toPlainText().isEmpty()) {m_bError = true; return true;}
|
||
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
|
||
QStringList strList = m_strUrl.split("&");
|
||
bool ok = false;
|
||
int nNow = GetNumber(strList.at(strList.size() - 1).split("=").at(1), ok);
|
||
if (!ok)
|
||
{
|
||
m_bError = true;
|
||
return true;
|
||
}
|
||
if ((nNow + 10) > nTotal || nNow >= 1000)
|
||
m_bLast = true;
|
||
}
|
||
return true;
|
||
}
|
||
|
||
// Indices into the QString array that saveFrameUrl() fills for one blog post.
enum E_DATA
{
    E_DATA_NICK = 0,        // author nickname
    E_DATA_ID,              // author id
    E_DATA_TITLE,           // post title
    E_DATA_DATE,            // post date
    E_DATA_DATA,            // post body text
    E_DATA_PLATFORM_TITLE,  // blog title
    E_DATA_MAX              // number of entries; used as the array size
};
|
||
|
||
/**
 * Extracts one Naver blog post from the frame tree into bodydata.
 *
 * The function looks at the given frame and then recurses into its child
 * frames. Only two frames carry data:
 *   - "BuddyConnectIframe": if a profile name is present the article URL is
 *     recorded;
 *   - "mainFrame": blog title, author nickname/id, post title, date, body,
 *     comment count, like count, profile text and profile image are read
 *     and stored in bodydata.
 *
 * If the profile text or like count is still empty and fewer than RETRY_MAX
 * attempts were made, reloadPage() is scheduled and false is returned.
 *
 * @return false when a retry was scheduled (by this frame or a child frame),
 *         true otherwise. Recursion stops at the first child returning false.
 */
bool SCrawler::saveFrameUrl(QWebFrame *frame)
{
    // Path segments of the article URL. value(n) yields an empty string for
    // a missing segment; the former at(n) ran past the end on short URLs.
    const QStringList urlParts = m_strUrl.split("/");

    if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0)
    {
        QWebElement profile = Find(frame->documentElement(),"div","class","profile_name");
        QString str = profile.toPlainText().split("\n").at(0);
        if (str.isEmpty() == false)
            bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
    }

    if (frame->frameName().compare(QString("mainFrame")) == 0)
    {
        QString str[E_DATA_MAX];
        QString sympathy;
        QString numofReply;
        QString strProfile;

        // ---- Blog title ------------------------------------------------
        // Preferred source: the part after '|' in the og:article:author
        // meta tag. value(1) is empty when the tag or the '|' is missing,
        // which lets the fallback below run instead of crashing.
        QWebElement proTitle = Find(frame->documentElement(),"meta","property","og:article:author");
        str[E_DATA_PLATFORM_TITLE] = proTitle.attribute("content").split("|").value(1).trimmed();
        if (str[E_DATA_PLATFORM_TITLE].length() > 0)
        {
            str[E_DATA_PLATFORM_TITLE] = GetSafeUtf(str[E_DATA_PLATFORM_TITLE]);
        }
        else
        {
            proTitle = Find(frame->documentElement(),"span","id","blogTitleName");
            str[E_DATA_PLATFORM_TITLE] = GetSafeUtf(proTitle.toPlainText().trimmed());
        }

        // ---- Author nickname / id ---------------------------------------
        QWebElement image;
        QWebElement profile = Find(frame->documentElement(),"div","id","blog-profile");
        {
            QWebElement nick = Find(profile,"strong","id","nickNameArea");
            if (nick.toPlainText().isEmpty() == false)
                str[E_DATA_NICK] = nick.toPlainText();

            if (str[E_DATA_NICK].isEmpty())
            {
                // Fall back to the JavaScript assignment in the page source.
                const QString strHtml = frame->toHtml();
                const QString strFind = "var nickName = '";
                const int start = strHtml.indexOf(strFind);
                const int valuePos = start + strFind.length();
                const int end = (start == -1) ? -1 : strHtml.indexOf("'", valuePos);

                // Not found, unterminated, or empty ('' right after the
                // marker): report once and leave the nickname empty.
                if (start == -1 || end == -1 || end == valuePos)
                {
                    cout << "error : nick name can not find and next again connect." << endl;
                }
                else
                {
                    str[E_DATA_NICK] = strHtml.mid(valuePos, end - valuePos);
                }
            }

            str[E_DATA_NICK] = GetSafeUtf(str[E_DATA_NICK]);

            if (urlParts.value(3).trimmed() == str[E_DATA_NICK].trimmed())
            {
                str[E_DATA_ID] = str[E_DATA_NICK];
            }
            else if (str[E_DATA_ID].isEmpty())
            {
                // blog.naver.com/<id>/...  versus  <id>.blog.me/...
                if (urlParts.value(2).compare("blog.naver.com") == 0)
                    str[E_DATA_ID] = urlParts.value(3);
                else
                    str[E_DATA_ID] = urlParts.value(2).split(".").at(0);
            }

            if (str[E_DATA_NICK].length() == 0)
                str[E_DATA_NICK] = str[E_DATA_ID];

            image = Find(profile,"img","alt","프로필 이미지");
            strProfile = Find(profile, "p", "class", "caption align").toPlainText().trimmed();
        }

        // ---- Post content -----------------------------------------------
        {
            QWebElement post = Find(frame->documentElement(),"div","id","postListBody");

            // Comment count: keep only the digits of the link text.
            {
                QWebElement weCmt = post.findFirst("a[class^='pcol2 _cmtList']");
                if (!weCmt.isNull())
                {
                    numofReply = weCmt.toPlainText().replace(",", "").trimmed();
                    numofReply = numofReply.replace(QRegExp("[\\D]"), "");
                }
            }

            QWebElement post_top = Find(post,"table","class","post-top");

            // Title: three page layouts are tried in turn.
            {
                QWebElement title = Find(post_top,"span","class","pcol1 itemSubjectBoldfont");
                if (title.isNull())
                    title = Find(frame->documentElement(), "div", "class", "se_textView");
                if (title.isNull())
                    title = Find(frame->documentElement(), "h3", "class", "se_textarea");
                if (title.toPlainText().isEmpty() == false)
                    str[E_DATA_TITLE] = GetSafeUtf(title.toPlainText());
            }

            // Date: '/' becomes '-' and ":00" is appended. A missing date
            // marks the post as failed.
            {
                QWebElement date = Find(post_top,"p","class","date fil5 pcol2 _postAddDate");
                if (date.isNull())
                    date = Find(frame->documentElement(), "span","class","se_publishDate pcol2 fil5");

                str[E_DATA_DATE] = date.toPlainText().trimmed().replace("/","-");
                if (str[E_DATA_DATE].isEmpty() == false)
                {
                    str[E_DATA_DATE] += ":00";
                }
                else
                {
                    UpdateError("Error code 4");
                    m_bUse = false;
                }
            }

            // Body text: again several layouts are tried.
            {
                QWebElement body = post.findFirst("div[class^='post-view pcol2 _param(1)']");
                if (body.isNull())
                    body = post.findFirst("div[class*='pcol2 _param(1)']");
                if (body.isNull())
                    body = Find(post, "class", "se_component_wrap sect_dsc __se_component_area");
                if (body.toPlainText().isEmpty() == false)
                    str[E_DATA_DATA] = GetSafeUtf(body.toPlainText());
            }

            // Like count ("0" when the button is absent).
            {
                QWebElement WEsympathy = Find(frame->documentElement(),"div","class","btn_like pcol2");
                if (WEsympathy.isNull())
                    sympathy = "0";
                else
                    sympathy = WEsympathy.toPlainText().trimmed();
            }

            // Retry while the profile text or the like count is still empty.
            if ((strProfile.isEmpty() || sympathy.isEmpty()) && (m_nRetryCount < RETRY_MAX))
            {
                m_nRetryCount++;
                qDebug() << m_nRetryCount;
                QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage()));
                return false;
            }
        }

        // ---- Hand everything to bodydata --------------------------------
        bodydata.setData(str[E_DATA_NICK].trimmed(), bodydata.ARTICLE_NICKNAME);
        bodydata.setData(str[E_DATA_ID].trimmed(), bodydata.ARTICLE_ID);
        bodydata.setData(str[E_DATA_TITLE].trimmed(), bodydata.ARTICLE_TITLE);
        bodydata.setData(str[E_DATA_DATE].trimmed(), bodydata.ARTICLE_DATE);
        bodydata.setData(str[E_DATA_DATA].trimmed(), bodydata.ARTICLE_DATA);
        bodydata.setData(str[E_DATA_PLATFORM_TITLE].trimmed(), bodydata.PLATFORM_TITLE);

        const QString strImageSrc = image.attribute("src").trimmed();
        if (strImageSrc.length() != 0)
            bodydata.setData(strImageSrc, bodydata.ARTICLE_PROFILEURL);

        strProfile = GetSafeUtf(strProfile);
        if (strProfile.length() > 0)
            bodydata.setData(strProfile, bodydata.ARTICLE_PROFILE);

        bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
        bodydata.setTable(m_strTable);

        //bodydata.setData(sympathy, bodydata.ARTICLE_HIT); //original data

        // NOTE(review): the comment count goes into ARTICLE_ORDER and the
        // like count into REPLY_URL - presumably a deliberate reuse of
        // existing columns; confirm against the table schema.
        bodydata.setData(numofReply, bodydata.ARTICLE_ORDER);
        bodydata.setData(sympathy, bodydata.REPLY_URL);

        bodydata.setData("naver", bodydata.PLATFORM_NAME);
        bodydata.setData("blog", bodydata.PLATFORM_FORM);
        bodydata.setData("body", bodydata.ARTICLE_FORM);
        bodydata.setData(urlParts.value(3), bodydata.PLATFORM_ID);
        bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID);
    }

    // Recurse into child frames; && short-circuits after the first failure.
    bool b_ok = true;
    foreach (QWebFrame *childFrame, frame->childFrames())
        b_ok = (b_ok && saveFrameUrl(childFrame));

    return b_ok;
}
|
||
|
||
void SCrawler::reloadPage()
|
||
{
|
||
//qDebug() << "reloadPage called";
|
||
saveResult(true);
|
||
}
|
||
|
||
void SCrawler::saveFrameComment(QWebFrame *frame)
|
||
{
|
||
QWebElement group = Find(frame->documentElement(),"ul","id","commentList");
|
||
QWebElementCollection elements = group.findAll("li");
|
||
QString strParent,strDate,strNick,strComm,strUrl,strId;
|
||
QStringList strList = m_strUrl.split("/");
|
||
QString strCommUrl;
|
||
for (int i=0; i < strList.size() - 1; i++)
|
||
strUrl += strList.at(i) + "/";
|
||
|
||
{
|
||
int nStartIdPos = m_strUrl.indexOf("blogId=") + QString("blogId=").size();
|
||
int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos);
|
||
strUrl += m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos) + '/';
|
||
|
||
}
|
||
|
||
{
|
||
int nStartIdPos = m_strUrl.indexOf("logNo=") + QString("logNo=").size();
|
||
int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos);
|
||
strUrl += m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos);
|
||
}
|
||
|
||
int nCount=0;
|
||
foreach (QWebElement element, elements)
|
||
{
|
||
if (element.attribute("class") == "_countableComment ")
|
||
{
|
||
strNick = strParent = GetSafeUtf(Find(element,"a","class","nick pcol2").toPlainText());
|
||
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
|
||
strComm = Find(element,"dd","class","comm pcol2").toPlainText();
|
||
|
||
strCommUrl = Find(element,"a","class","nick pcol2").attribute("href");
|
||
if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0)
|
||
strId = strCommUrl.split("/").at(3).trimmed();
|
||
if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0)
|
||
strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed();
|
||
if(strCommUrl.left(1) == "/")
|
||
{
|
||
QStringList strList = strCommUrl.split("&");
|
||
foreach(QString str, strList)
|
||
{
|
||
if(str.left(3) == "id=")
|
||
{
|
||
strId = str.right(str.length() - 3);
|
||
}
|
||
}
|
||
}
|
||
|
||
strComm = GetSafeUtf(strComm);
|
||
if (strComm.isEmpty()== false)
|
||
{
|
||
strComm.replace("'","\\'");
|
||
strComm.replace("\"","\\\"");
|
||
strComm = strComm.trimmed();
|
||
QSqlQuery query;
|
||
if(strId.length() > 0)
|
||
{
|
||
query.prepare(QString("insert into " + m_strTable + " (article_id,article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:ID,:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
|
||
query.bindValue(":ID", strId.toUtf8());
|
||
}
|
||
else
|
||
query.prepare(QString("insert into " + m_strTable + " (article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
|
||
|
||
|
||
int nStartIdPos = m_strUrl.indexOf("blogId=") + QString("blogId=").size();
|
||
int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos);
|
||
|
||
query.bindValue(":URL", strUrl.toUtf8());
|
||
query.bindValue(":PLATFORMID",m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos));
|
||
query.bindValue(":NICK",strNick.toUtf8());
|
||
query.bindValue(":DATA",strComm.toUtf8());
|
||
query.bindValue(":DATE",strDate.toUtf8());
|
||
query.bindValue(":URLREPLY",m_strUrl.toUtf8());
|
||
query.bindValue(":ROWNUM",(nCount++));
|
||
|
||
if (query.exec()==false)
|
||
cout << "error : " << query.lastError().text().toStdString();
|
||
}
|
||
}
|
||
if (element.attribute("class") == "reply _countableComment ")
|
||
{
|
||
strNick = Find(element,"a","class","nick pcol2").toPlainText();
|
||
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
|
||
QWebElement subElement = Find(element,"dd","class","comm pcol2");
|
||
QString subNick = Find(subElement,"a","class","nick pcol2").toPlainText();
|
||
strComm = subElement.toPlainText();
|
||
|
||
strCommUrl = Find(element,"a","class","nick pcol2").attribute("href");
|
||
if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0)
|
||
strId = strCommUrl.split("/").at(3).trimmed();
|
||
if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0)
|
||
strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed();
|
||
if(strCommUrl.left(1) == "/")
|
||
{
|
||
QStringList strList = strCommUrl.split("&");
|
||
foreach(QString str, strList)
|
||
{
|
||
if(str.left(3) == "id=")
|
||
{
|
||
strId = str.right(str.length() - 3);
|
||
}
|
||
}
|
||
}
|
||
|
||
if(subNick.isEmpty() == false)
|
||
strComm = strComm.right(strComm.size()-subNick.size()-1);
|
||
|
||
if (strComm.isEmpty() == false)
|
||
{
|
||
strComm = GetSafeUtf(strComm);
|
||
strComm.replace("'","\\'");
|
||
strComm.replace("\"","\\\"");
|
||
strComm = strComm.trimmed();
|
||
QSqlQuery query;
|
||
if(strId.length() > 0)
|
||
{
|
||
query.prepare(QString("insert into " + m_strTable + " (article_id,article_url,platform_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:ID,:URL,:PLATFORMID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
|
||
query.bindValue(":ID", strId.toUtf8());
|
||
}
|
||
else
|
||
query.prepare(QString("insert into " + m_strTable + " (article_url,platform_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:PLATFORMID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
|
||
|
||
int nStartIdPos = m_strUrl.indexOf("blogId=") + QString("blogId=").size();
|
||
int nEndIdPos = m_strUrl.indexOf('&', nStartIdPos);
|
||
|
||
query.bindValue(":URL",strUrl.toUtf8());
|
||
query.bindValue(":PLATFORMID",m_strUrl.mid(nStartIdPos, nEndIdPos - nStartIdPos));
|
||
query.bindValue(":NICK",strNick.toUtf8());
|
||
query.bindValue(":DATA",strComm.toUtf8());
|
||
query.bindValue(":DATE",strDate.toUtf8());
|
||
query.bindValue(":PARENT",strParent.toUtf8());
|
||
query.bindValue(":URLREPLY",m_strUrl.toUtf8());
|
||
query.bindValue(":ROWNUM",(nCount++));
|
||
if (query.exec()==false)
|
||
{
|
||
cout << "error : " << query.lastError().text().toStdString();
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
 * Parses a Naver cafe search-result page.
 *
 * Links of result items (li.sh_cafe_top) whose host is "cafe.naver.com" are
 * removed from m_strTable (so they can be re-crawled) and echoed on stdout
 * as "o <url>". Afterwards the result counter is read to decide whether this
 * was the last page (m_bLast). Problems are reported through m_bError.
 *
 * Does nothing when m_bUse is already set.
 */
void SCrawler::saveFrameCafeList(QWebFrame *frame)
{
    if (m_bUse == true) return;

    QWebElement notFound = Find(frame->documentElement(),"div","id","notfound");
    if (notFound.isNull() == false)
    {
        m_bLast = true;
        return;
    }

    QWebElement naverBlock = Find(frame->documentElement(), "p", "class", "info01");
    if (!naverBlock.isNull())
    {
        m_bError = true;
        cout << "naver";
        return;
    }

    QWebElement eleMain = Find(frame->documentElement(),"div","class","cafe_article section _cafeBase");

    // ---- Collect the result links in a single pass --------------------
    QStringList urlList;        // links hosted on cafe.naver.com
    bool bFoundItem = false;    // at least one result item was present

    foreach (QWebElement eleSub, eleMain.findAll("li"))
    {
        if (eleSub.attribute("class") != "sh_cafe_top")
            continue;
        bFoundItem = true;

        QString strUrl;
        foreach (QWebElement eleSubUrl, eleSub.findAll("a"))
        {
            if (eleSubUrl.attribute("class") == "url")
                strUrl = eleSubUrl.attribute("href");
        }

        // value(2) is empty for a missing or malformed link; at(2) used to
        // run past the end of the list in that case.
        if (strUrl.split("/").value(2) == "cafe.naver.com")
            urlList << strUrl;
    }

    // ---- Remove rows that are about to be crawled again ----------------
    if (urlList.size() > 0)
    {
        // The links come from a remote page, so they are bound as
        // parameters instead of being pasted into the statement text.
        QStringList placeholders;
        for (int i = 0; i < urlList.size(); ++i)
            placeholders << "?";

        const QString strQuery = "delete from " + m_strTable
                               + " where article_url in (" + placeholders.join(",") + ")";

        QSqlQuery sql;
        bool bDeleted = sql.prepare(strQuery);
        if (bDeleted)
        {
            foreach (const QString &strAccepted, urlList)
                sql.addBindValue(strAccepted);
            bDeleted = sql.exec();
        }
        if (!bDeleted)
        {
            cout << "error " << sql.lastError().text().toStdString();
            cout << strQuery.toStdString();
        }
    }

    // ---- Report the links ----------------------------------------------
    foreach (const QString &strAccepted, urlList)
        cout << "o " << strAccepted.toStdString() << endl;

    // Any result item - accepted or not - marks the page as used.
    if (bFoundItem)
        m_bUse = true;

    // ---- Last-page detection -------------------------------------------
    // The text after the '/' in the counter is taken as the total.
    const QString strCounter = Find(eleMain,"span","class","title_num").toPlainText();
    const QStringList counterParts = strCounter.split("/");
    if (strCounter.isEmpty() || counterParts.size() < 2)
    {
        m_bError = true;
        return;
    }
    const int nTotal = GetNumber(counterParts.at(1));

    // The current offset is the value of the last query parameter of the
    // URL; a missing value counts as 0.
    const int nNow = GetNumber(m_strUrl.split("&").last().split("=").value(1));

    if ((nNow + 10) > nTotal || nNow >= 1000)
        m_bLast = true;
}
|
||
|
||
/**
 * Extracts a Naver cafe article and its comments from a frame tree.
 *
 * Every visited frame may contribute the cafe title (h1.d-none). The frame
 * named "cafe_main" supplies the article itself, which is staged in
 * `bodydata`, and its comments/replies, which are inserted into m_strTable
 * one row each. Child frames are visited recursively; once the article has
 * been handled m_bUse is set and all further calls return immediately.
 *
 * @param frame Frame to inspect (its children are inspected as well).
 */
void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
{
    if (m_bUse) return;

    QWebElement doc = frame->documentElement();

    const QString strCafeTitle = Find(doc,"h1","class","d-none").toPlainText();
    if (!strCafeTitle.isEmpty())
        bodydata.setData(SqlString(GetSafeUtf(strCafeTitle)), bodydata.PLATFORM_TITLE);

    if (frame->frameName() == "cafe_main")
    {
        // Fourth '/'-separated part of the URL. value() yields an empty string
        // instead of running off the list when the URL is shorter than that.
        const QString strPlatformId = m_strUrl.split("/").value(3);

        // ---- article body ----
        {
            const QString strData = GetSafeUtf(SqlString(Find(doc,"div","class","tbody m-tcol-c").toPlainText().trimmed()));

            // Primary date cell is suffixed with ":00"; the fallback element
            // is suffixed with " 00:00:00".
            QString strDate = Find(doc,"td","class","m-tcol-c date").toPlainText().trimmed().replace(".","-");
            if (strDate.isEmpty())
            {
                strDate = Find(doc,"em","class","date m-tcol-c").toPlainText().trimmed().replace(".","-");
                strDate += " 00:00:00";
            }
            else
                strDate += ":00";

            const QString strTitle = SqlString(Find(doc,"span","class","b m-tcol-c").toPlainText().trimmed());

            // Writer nickname / id. The id (and, as a fallback, the nickname)
            // is taken from the comma-separated arguments of the nick link's
            // onclick attribute.
            QString strNick = Find(Find(doc,"div","class","etc-box"),"td","class","p-nick").toPlainText().trimmed();
            QString strID;
            const QStringList onclickArgs = Find(Find(doc,"td","class","m-tcol-c b nick"),"a","class","m-tcol-c b").attribute("onclick").trimmed().split(",");
            if (!strNick.isEmpty())
            {
                // Keep only the text before the first "(".
                strNick = strNick.split("(").value(0);
                if (onclickArgs.size() >= 2)
                    strID = onclickArgs.at(1).trimmed().replace("'","");
            }
            else if (onclickArgs.size() >= 4)
            {
                strID = onclickArgs.at(1).trimmed().replace("'","");
                strNick = onclickArgs.at(3).trimmed().replace("'","");
            }

            QString strHits = Find(doc,"span","class","b m-tcol-c reply _rosReadcount").toPlainText();
            if (strHits.isEmpty())
                strHits = Find(doc,"span","class","kin_count m-tcol-c _rosReadcount").toPlainText();

            // Reply count: digits only.
            QString strReply = doc.findFirst("td.reply").toPlainText().replace(",", "").trimmed();
            strReply = strReply.replace(QRegExp("[\\D]"), "");

            const QString strLike = doc.findFirst("a#upArticleLink").toPlainText().replace(",", "").trimmed();

            bodydata.setTable(m_strTable);
            bodydata.setData(strData, bodydata.ARTICLE_DATA);
            bodydata.setData(strDate, bodydata.ARTICLE_DATE);
            bodydata.setData(strNick, bodydata.ARTICLE_NICKNAME);
            bodydata.setData(strID, bodydata.ARTICLE_ID);
            bodydata.setData(strHits, bodydata.ARTICLE_HIT);
            bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
            bodydata.setData("naver", bodydata.PLATFORM_NAME);
            bodydata.setData("cafe", bodydata.PLATFORM_FORM);
            bodydata.setData("body", bodydata.ARTICLE_FORM);
            bodydata.setData(strPlatformId, bodydata.PLATFORM_ID);
            bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID);
            // The reply count is stored in the ARTICLE_ORDER slot and the like
            // count in the REPLY_URL slot.
            bodydata.setData(strReply, bodydata.ARTICLE_ORDER);
            bodydata.setData(strTitle, bodydata.ARTICLE_TITLE);
            bodydata.setData(strLike, bodydata.REPLY_URL);
        }

        // ---- comments ----
        {
            const QWebElementCollection elements = Find(doc,"ul","id","cmt_list").findAll("li");
            QString strParent;   // nickname of the most recent top-level comment
            int nCount = 0;      // running row number across comments and replies
            foreach (QWebElement element, elements)
            {
                const QString strClass = element.attribute("class");

                if (strClass.isEmpty())
                {
                    // Top-level comment.
                    QString strData = SqlString(Find(element,"span","class","comm_body").toPlainText().trimmed());
                    if (strData.isEmpty()) continue;
                    strData = GetSafeUtf(strData);
                    const QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
                    const QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed();
                    strParent = strNick;
                    const QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed();
                    if (strDate.isEmpty()) continue;

                    QSqlQuery query;
                    query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,platform_id,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:PLATFORMID,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8());
                    query.bindValue(":URL",m_strUrl.toUtf8());
                    query.bindValue(":PLATFORMID",strPlatformId.toUtf8());
                    query.bindValue(":ID",strID.toUtf8());
                    query.bindValue(":NICK",strNick.toUtf8());
                    query.bindValue(":DATA",strData.toUtf8());
                    query.bindValue(":DATE",strDate.toUtf8());
                    query.bindValue(":URLREPLY",m_strReper.toUtf8());
                    query.bindValue(":ROWNUM",nCount++);

                    if (query.exec()==false)
                        cout << "error : " << query.lastError().text().toStdString();
                }
                else if (strClass == "reply")
                {
                    // Reply to a comment.
                    // NOTE(review): unlike top-level comments, the reply text is
                    // not passed through SqlString/GetSafeUtf and an empty date
                    // does not skip the row - confirm whether that is intended.
                    const QString strData = Find(element,"span","class","comm_body").toPlainText().trimmed();
                    if (strData.isEmpty()) continue;
                    const QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
                    const QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed();
                    const QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed();

                    // Parent is the explicitly mentioned nickname when present,
                    // otherwise the last top-level comment's author.
                    QString strReParent = strParent;
                    const QString strMention = Find(element,"span","class","re-p-nick").toPlainText();
                    if (!strMention.isEmpty())
                        strReParent = strMention;

                    QSqlQuery query;
                    query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,platform_id,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:PLATFORMID,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
                    query.bindValue(":URL",m_strUrl.toUtf8());
                    query.bindValue(":PLATFORMID",strPlatformId.toUtf8());
                    query.bindValue(":ID",strID.toUtf8());
                    query.bindValue(":NICK",strNick.toUtf8());
                    query.bindValue(":DATA",strData.toUtf8());
                    query.bindValue(":DATE",strDate.toUtf8());
                    query.bindValue(":PARENT",strReParent.toUtf8());
                    query.bindValue(":URLREPLY",m_strReper.toUtf8());
                    query.bindValue(":ROWNUM",nCount++);

                    if (query.exec()==false)
                        cout << "error : " << query.lastError().text().toStdString();
                }
            }
        }
        m_bUse = true;
    }

    foreach(QWebFrame *childFrame, frame->childFrames())
        saveFrameCafeUrl(childFrame);
}
|
||
|
||
|
||
/**
 * Handles one page of Daum cafe search results.
 *
 * Collects the result links whose host part is "cafe.daum.net", deletes any
 * rows in m_strTable already stored for those URLs (query string stripped),
 * prints each link to stdout as "o <url>", and finally decides from the
 * page's result counter whether this is the last page (m_bLast) or whether
 * the page could not be understood (m_bError).
 *
 * @param frame Frame holding the search result document.
 */
void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
{
    if (m_bUse == true) return;

    QWebElement doc = frame->documentElement();
    QWebElement eleMain = Find(doc,"div","class","type_fulltext wid_f");

    // Single pass over the result entries: remember the cafe links and
    // whether any entry was present at all.
    QStringList urlList;
    bool bFoundEntry = false;
    foreach(QWebElement eleSub, eleMain.findAll("div"))
    {
        if (eleSub.attribute("class") != "wrap_cont") continue;
        bFoundEntry = true;

        QString strUrl;
        foreach(QWebElement eleSubUrl, eleSub.findAll("a"))
        {
            if (eleSubUrl.attribute("class") == "f_url")
                strUrl = eleSubUrl.attribute("href");
        }

        // value() instead of at(): an entry without an "f_url" link leaves
        // strUrl empty, and split() then has no third element.
        if (strUrl.split("/").value(2) == "cafe.daum.net")
            urlList << strUrl;
    }

    if (!urlList.isEmpty())
    {
        // The URLs come from scraped markup, so they are bound as parameters
        // rather than concatenated into the statement text.
        QStringList placeholders;
        for (int i = 0; i < urlList.size(); ++i)
            placeholders << "?";

        QSqlQuery sql;
        sql.prepare(QString("delete from ") + m_strTable + " where article_url in (" + placeholders.join(",") + ")");
        foreach(const QString &str, urlList)
        {
            // Rows are keyed by the URL without its query string.
            const int nQueryPos = str.indexOf('?');
            const QString strKey = (nQueryPos >= 0) ? str.left(nQueryPos).trimmed() : str;
            // Bound as UTF-8 bytes, the same way this file binds article_url
            // in its insert statements.
            sql.addBindValue(strKey.toUtf8());
        }

        if (sql.exec() == false)
        {
            cout << "error " << sql.lastError().text().toStdString();
            cout << sql.lastQuery().toStdString();
        }
    }

    foreach(const QString &str, urlList)
        cout << "o " << str.toStdString() << endl;

    if (bFoundEntry)
        m_bUse = true;

    if (eleMain.isNull())
        m_bLast = true;

    if (!Find(doc,"div","id","noResult").isNull())
    {
        m_bLast = true;
        return;
    }

    // Last-page detection.
    const bool bNoHiddenMessage = Find(doc, "div", "class", "result_message mg_cont hide").isNull();
    const bool bVisibleMessage = !Find(doc, "div", "class", "result_message mg_cont").isNull();
    const bool b_last = bNoHiddenMessage || bVisibleMessage;

    // Counter text is expected to look like "<first>-<last> / <total>".
    const QString strCounter = Find(eleMain,"span","class","f_nb f_l").toPlainText();
    if (strCounter.isEmpty()) {m_bError = true; return;}

    const QStringList parts = strCounter.split("/");
    if (parts.size() < 2) {m_bError = true; return;}

    // Total = first run of digits after the slash, thousands separators removed.
    QRegExp rx("(\\d+)");
    if (rx.indexIn(QString(parts.at(1)).replace(",","")) == -1) {m_bError = true; return;}
    const int nTotal = rx.cap(1).toInt();

    const QStringList range = parts.at(0).trimmed().split("-");
    if (range.size() < 2) {m_bError = true; return;}
    const int nNow = GetNumber(range.at(range.size() - 1));
    const int nNowFirst = GetNumber(range.at(range.size() - 2));

    // Stop at 1000 results, at the reported total, on a short page, or when
    // the page's result message says so.
    if (nNow >= 1000 || nNow >= nTotal || (nNow - nNowFirst) < 9 || b_last)
        m_bLast = true;
}
|
||
|
||
|
||
void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
|
||
{
|
||
if (m_bUse) return;
|
||
|
||
QWebElement other = frame->documentElement().findFirst("title");
|
||
QString strTitle = other.toPlainText().trimmed().split("|").at(0).trimmed();
|
||
QString strUrl_;
|
||
if (strTitle.isEmpty() == false)
|
||
{
|
||
|
||
bodydata.setTable(m_strTable);
|
||
QStringList strlist = m_strUrl.split("?");
|
||
if(strlist.size() > 1)
|
||
{
|
||
bodydata.setData(strlist.at(0).trimmed(), bodydata.ARTICLE_URL);
|
||
strUrl_ = strlist.at(0).trimmed();
|
||
}
|
||
else
|
||
{
|
||
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
|
||
strUrl_ = m_strUrl;
|
||
}
|
||
bodydata.setData(SqlString(GetSafeUtf(strTitle)), bodydata.PLATFORM_TITLE);
|
||
}
|
||
|
||
if (frame->frameName() == "down")
|
||
{
|
||
QString strHits;
|
||
{
|
||
QString strData,strDate,strNick,strID,strTitle;
|
||
{
|
||
QWebElement group = Find(frame->documentElement(),"div","class","bbs_contents");
|
||
strData = SqlString(group.toPlainText().trimmed());
|
||
strData = GetSafeUtf(strData);
|
||
}
|
||
{
|
||
QWebElement group = Find(frame->documentElement(),"span","class","p11 ls0");
|
||
strDate = group.toPlainText().trimmed().replace(".","-");
|
||
strDate = strDate.replace("- "," ");
|
||
if (strDate.isEmpty() == true)
|
||
strDate = Find(frame->documentElement(),"input","name","PLAIN_REGDT").attribute("value");
|
||
else
|
||
strDate += ":00";
|
||
}
|
||
{
|
||
QWebElement group = Find(frame->documentElement(),"div","class","subject");
|
||
QWebElement group2 = Find(group,"span","class","b");
|
||
strTitle = SqlString(group2.toPlainText().trimmed());
|
||
}
|
||
|
||
{
|
||
QWebElement group = Find(Find(frame->documentElement(),"div","class","article_writer"),"a","href","#");
|
||
strNick = group.toPlainText().trimmed();
|
||
|
||
QWebElement id = Find(frame->documentElement(),"div","class","article_writer");
|
||
QStringList list = FindLeft(id,"a","onclick","showSideView").attribute("onclick").trimmed().split(",");
|
||
if (list.size() >= 2)
|
||
strID = list.at(1).trimmed().replace("'","");
|
||
}
|
||
|
||
QStringList strList = Find(frame->documentElement(),"div","class","article_writer").toPlainText().split("|");
|
||
|
||
foreach(QString str,strList)
|
||
{
|
||
QStringList substrList = str.split(" ");
|
||
for(int i = 0;i < substrList.size();i++)
|
||
{
|
||
if((substrList.at(i).trimmed().compare("조회") == 0) || (substrList.at(i).trimmed().compare("\"조회\"") == 0))
|
||
{
|
||
strHits = substrList.at(i+1).trimmed();
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
{
|
||
bodydata.setTable(m_strTable);
|
||
bodydata.setData(strData, bodydata.ARTICLE_DATA);
|
||
bodydata.setData(strDate, bodydata.ARTICLE_DATE);
|
||
bodydata.setData(strNick, bodydata.ARTICLE_NICKNAME);
|
||
if(!strID.isEmpty())
|
||
bodydata.setData(strID, bodydata.ARTICLE_ID);
|
||
bodydata.setData(strHits, bodydata.ARTICLE_HIT);
|
||
QStringList strlist = m_strUrl.split("?");
|
||
if(strlist.size() > 1)
|
||
{
|
||
bodydata.setData(strlist.at(0).trimmed(), bodydata.ARTICLE_URL);
|
||
strUrl_ = strlist.at(0).trimmed();
|
||
}
|
||
else
|
||
{
|
||
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
|
||
strUrl_ = m_strUrl;
|
||
}
|
||
bodydata.setData("daum", bodydata.PLATFORM_NAME);
|
||
bodydata.setData("cafe", bodydata.PLATFORM_FORM);
|
||
bodydata.setData("body", bodydata.ARTICLE_FORM);
|
||
bodydata.setData(m_strUrl.split("/").at(3), bodydata.PLATFORM_ID);
|
||
bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID);
|
||
bodydata.setData(strTitle, bodydata.ARTICLE_TITLE);
|
||
}
|
||
}
|
||
// Comment
|
||
{
|
||
QWebElement group = Find(frame->documentElement(),"div","class","commentDiv bg_sub");
|
||
QList<QWebElement> elements = FindAllMid(group,"div","id","_cmt-",0,5);
|
||
QString commHidden = "comment_hidden";
|
||
QString commPos = "comment_pos";
|
||
QString commReComm = "recomment_pos";
|
||
QString strParent;
|
||
int nCount = 0;
|
||
foreach (QWebElement element, elements)
|
||
{
|
||
|
||
if (element.attribute("class").trimmed().right(commHidden.length()).compare(commHidden,Qt::CaseInsensitive) != 0){
|
||
if (element.attribute("class").trimmed().left(commPos.length()).compare(commPos,Qt::CaseInsensitive) == 0)
|
||
{
|
||
QString strData = SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed());
|
||
if (strData.isEmpty()) continue;
|
||
strData = GetSafeUtf(strData);
|
||
|
||
QString strID;
|
||
QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(",");
|
||
if(strListID.length() > 2)
|
||
strID = strListID.at(1).trimmed().replace("'","");
|
||
|
||
QString strNick = Find(element,"a","class","b").toPlainText().trimmed();
|
||
strParent = strNick;
|
||
|
||
QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed();
|
||
QString strDate;
|
||
if(strDatetest.count(".") == 0)
|
||
{
|
||
strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd");
|
||
strDate += (" " + strDatetest + ":00");
|
||
QDateTime getTime = QDateTime::fromString(strDate, "yyyy-MM-dd hh:mm:ss");
|
||
QDateTime nowTime = QDateTime::currentDateTime();
|
||
if(getTime > nowTime)
|
||
{
|
||
getTime.addDays(-1);
|
||
strDate = getTime.toString("yyyy-MM-dd hh:mm:ss");
|
||
}
|
||
}
|
||
else
|
||
{
|
||
strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
|
||
}
|
||
|
||
if (strDate.isEmpty()) continue;
|
||
QSqlQuery query;
|
||
|
||
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
|
||
query.bindValue(":URL",strUrl_.toUtf8());
|
||
query.bindValue(":ID",strID.toUtf8());
|
||
query.bindValue(":NICK",strNick.toUtf8());
|
||
query.bindValue(":DATA",strData.toUtf8());
|
||
query.bindValue(":DATE",strDate.toUtf8());
|
||
query.bindValue(":ROWNUM",nCount++);
|
||
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
|
||
query.bindValue(":HITS",strHits.toUtf8());
|
||
query.bindValue(":TITLE",strTitle.toUtf8());
|
||
|
||
if (query.exec()==false)
|
||
cout << "error : " << query.lastError().text().toStdString();
|
||
}
|
||
if (element.attribute("class").trimmed().left(commReComm.length()).compare(commReComm,Qt::CaseInsensitive) == 0)
|
||
{
|
||
QString strData = SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed());
|
||
if (strData.isEmpty()) continue;
|
||
|
||
QString strReParent = Find(element,"span","class","mention_nicknames text_counter txt_point b").toPlainText().trimmed();
|
||
if(strReParent.length() == 0)
|
||
strReParent = strParent;
|
||
|
||
QString strID;
|
||
QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(",");
|
||
if(strListID.length() > 2)
|
||
strID = strListID.at(1).trimmed().replace("'","");
|
||
|
||
QString strNick = Find(element,"a","class","b").toPlainText().trimmed();
|
||
QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed();
|
||
QString strDate;
|
||
if(strDatetest.count(".") == 0)
|
||
{
|
||
strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd");
|
||
strDate += (" " + strDatetest + ":00");
|
||
QDateTime getTime = QDateTime::fromString(strDate, "yyyy-MM-dd hh:mm:ss");
|
||
QDateTime nowTime = QDateTime::currentDateTime();
|
||
if(getTime > nowTime)
|
||
{
|
||
getTime.addDays(-1);
|
||
strDate = getTime.toString("yyyy-MM-dd hh:mm:ss");
|
||
}
|
||
}
|
||
else
|
||
{
|
||
strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
|
||
}
|
||
|
||
if (strDate.isEmpty()) continue;
|
||
QSqlQuery query;
|
||
|
||
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
|
||
query.bindValue(":URL",strUrl_.toUtf8());
|
||
query.bindValue(":ID",strID.toUtf8());
|
||
query.bindValue(":NICK",strNick.toUtf8());
|
||
query.bindValue(":DATA",strData.toUtf8());
|
||
query.bindValue(":DATE",strDate.toUtf8());
|
||
query.bindValue(":PARENT",strReParent.toUtf8());
|
||
//query.bindValue(":URLREPLY",m_strReper.toUtf8());
|
||
query.bindValue(":ROWNUM",nCount++);
|
||
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
|
||
query.bindValue(":HITS",strHits.toUtf8());
|
||
query.bindValue(":TITLE",strTitle.toUtf8());
|
||
//QWebView::page()->mainFrame()->evaluateJavaScript("");
|
||
if (query.exec()==false)
|
||
cout << "error : " << query.lastError().text().toStdString();
|
||
}
|
||
}
|
||
}
|
||
}
|
||
m_bUse = true;
|
||
}
|
||
|
||
foreach(QWebFrame *childFrame, frame->childFrames())
|
||
saveFrameDaumCafeUrl(childFrame);
|
||
}
|
||
|
||
void SCrawler::saveFrameDaumBlogUrl(QWebFrame *frame){}
|
||
void SCrawler::saveFrameDaumBlogComment(QWebFrame *frame){}
|
||
void SCrawler::saveFrameDaumBlogList(QWebFrame *frame){}
|
||
|
||
/**
 * Handles one page of Naver news search results.
 *
 * Prints every "go_naver" link found in the headline result area to stdout
 * as "o <url>" (links containing "http://sports" are skipped) and flags
 * m_bNothing when at least one link was printed. The numbers in the
 * "result_num" span then decide whether this is the last page (m_bLast);
 * if exactly three numbers cannot be read, m_bError is set.
 *
 * @param frame Frame holding the search result document.
 */
void SCrawler::saveFrameNewsList(QWebFrame *frame)
{
    if (m_bUse == true)
        return;

    QWebElement root = frame->documentElement();

    // An explicit "no content" box means there is nothing further to fetch.
    if (!Find(root,"div","class","no_content").isNull())
    {
        m_bLast = true;
        return;
    }

    const QWebElementCollection resultDivs = Find(root,"div","class","srch_result_area headline").findAll("div");
    for (int idx = 0; idx < resultDivs.count(); ++idx)
    {
        QWebElement infoBox = resultDivs.at(idx);
        if (infoBox.attribute("class") != QString("info"))
            continue;

        const QString link = Find(infoBox,"a","class","go_naver").attribute("href");
        if (link.trimmed().isEmpty() || link.contains("http://sports"))
            continue;

        m_bNothing = true;
        cout << "o " << link.toStdString() << endl;
    }

    // Collect the non-blank numeric pieces of the result counter.
    const QStringList counterParts = bodydata.GetNumber(Find(root,"span","class","result_num").toPlainText().trimmed());
    QVector <int> numbers;
    foreach(QString piece, counterParts)
    {
        if (!piece.trimmed().isEmpty())
            numbers.push_back(piece.toInt());
    }

    if (numbers.size() != 3)
    {
        m_bError = true;
    }
    else if (numbers.at(0) >= numbers.at(1) || numbers.at(1) == numbers.at(2))
    {
        m_bLast = true;
    }

    m_bUse = true;
}
|
||
|
||
|
||
/**
 * Extracts a Naver news article from @p frame and stages it in `bodydata`.
 *
 * Any row already stored for m_strUrl is deleted first. Title, date, body,
 * like count, comment count and "sympathy" count are then read using the
 * selectors of the regular layout, falling back to the entertainment and
 * sports layouts. While the comment or sympathy count is still missing or
 * "0" and fewer than RETRY_MAX attempts have been made, a page reload is
 * scheduled instead.
 *
 * @param frame Frame holding the article document.
 * @return true when the article was staged (or had been handled before),
 *         false when a reload was scheduled.
 */
bool SCrawler::saveFrameNewsUrl(QWebFrame *frame)
{
    if (m_bUse) return true;

    // Remove a previously stored copy of this article. The URL is bound as a
    // parameter (UTF-8 bytes, the same way this file binds article_url in its
    // insert statements) instead of being concatenated into the statement.
    {
        QSqlQuery query;
        query.prepare(QString("delete from ") + m_strTable + " where article_url = ?");
        query.addBindValue(m_strUrl.toUtf8());
        if(query.exec() == false)
        {
            cout << query.lastError().text().toStdString();
            cout << query.lastQuery().toStdString();
        }
    }

    QWebElement doc = frame->documentElement();
    const QRegExp reNonDigit("[\\D]");

    QString strTitle,strDate,strData,strPlatID,strPlatTitle,strlike,strReply,strSympathy;

    QWebElement element = Find(doc,"div","class","article_info");
    strTitle = Find(element,"h3","id","articleTitle").toPlainText();
    strDate = Find(element,"span","class","t11").toPlainText();
    strData = Find(doc,"div","id","articleBodyContents").toPlainText();
    strlike = Find(doc,"div","class","u_likeit_module").toPlainText();
    strSympathy = doc.findFirst("em.u_cnt").toPlainText().replace(reNonDigit, "").trimmed();

    // Comment count: try each known layout until one yields digits.
    strReply = Find(doc, "span", "class", "lo_txt").toPlainText().replace(reNonDigit, "").trimmed(); // normal
    if (strReply.isEmpty())
        strReply = Find(doc, "a", "class", "reply_count").toPlainText().replace(reNonDigit, "").trimmed(); // entertain
    if (strReply.isEmpty())
        strReply = Find(doc, "span", "id", "newsCommentCount").toPlainText().replace(reNonDigit, "").trimmed(); // sports
    if (strReply.isEmpty())
        strReply = Find(doc, "span", "class", "u_cbox_count").toPlainText().replace(reNonDigit, "").trimmed();
    qDebug() << strReply << ", " << strSympathy;

    // The counters may not be filled in yet; retry a limited number of times.
    // The retry counter is only advanced when a counter is actually missing.
    const bool bCountersMissing = strReply.isEmpty() || (strReply == "0") || strSympathy.isEmpty() || (strSympathy == "0");
    if (bCountersMissing && (++m_nRetryCount < RETRY_MAX))
    {
        QTimer::singleShot(1000, this, SLOT(reloadPage()));
        return false;
    }

    if (strSympathy.isEmpty())
        strSympathy = "0";

    // Entertainment layout fallbacks.
    if (strTitle.isEmpty())
        strTitle = Find(Find(doc,"div","class","end_ct_area"),"p","class","end_tit").toPlainText();
    if (strDate.isEmpty())
        strDate = Find(element,"em").toPlainText();
    if (strData.isEmpty())
        strData = Find(doc,"div","id","articeBody").toPlainText();

    // Append the like count to the body text. This happens before the sports
    // fallback below, which therefore only runs when no like text was found.
    if (strlike.isEmpty() == false)
        strData += "\r\nlike(" + QString::number(strlike.toInt()) + ")";

    // Sports layout fallbacks.
    if (strTitle.isEmpty())
        strTitle = Find(doc,"h4","class","title").toPlainText();

    if (strData.isEmpty())
    {
        strData = Find(doc, "div", "id", "newsEndContents").toPlainText();
        // Cut the trailing link/source text off the body by length.
        const QString strSpam = Find(doc, "div", "class", "link_news").toPlainText();
        const QString strSource = Find(doc, "p", "class", "source").toPlainText();
        strData = strData.left(strData.length() - strSpam.length() - strSource.length());
    }

    if (strDate.isEmpty())
    {
        strDate = doc.findFirst("div[class='info']>span").toPlainText();
        QRegExp reDate("([\\d]{4}).([\\d]{2}).([\\d]{2})");
        QRegExp reTime("([\\d]{2}):([\\d]{2})");
        QString date;
        QString time;

        if (reDate.indexIn(strDate) != -1)
            date = reDate.cap(1) + "-" + reDate.cap(2) + "-" + reDate.cap(3);

        if (reTime.indexIn(strDate) != -1)
        {
            // Convert the 12-hour clock marked with 오전 (AM) / 오후 (PM).
            QString hour = reTime.cap(1);
            if (strDate.contains("오후") && (hour != "12"))
                hour = QString::number(hour.toInt() + 12);
            else if (strDate.contains("오전") && (hour == "12"))
                hour = "00";    // 12:xx AM is 00:xx
            time = hour + ":" + reTime.cap(2);
        }

        strDate = date + " " + time + ":00";
    }

    // Publisher id / name.
    element = Find(doc,"div","class","press_logo");
    if(!element.isNull())
    {
        strPlatID = Find(element,"a").attribute("href");
        strPlatTitle = Find(element,"img").attribute("alt");
    }
    else //sports
    {
        element = doc.findFirst("span[class='logo']>img");
        strPlatTitle = element.attribute("alt");
        const QString strLink = Find(doc, "a", "class", "press_link").attribute("href");
        // NOTE(review): the "+ 7" keeps one character less than the host part
        // for both "http://" and "https://" links - looks off by one, confirm
        // before changing because stored platform ids depend on it.
        strPlatID = strLink.left(strLink.mid(8).indexOf('/') + 7);
    }

    // "http://www.<name>.<tld>..." is shortened to "<name>".
    {
        const QStringList strlistPlat = strPlatID.split(".");
        if(strlistPlat.size() > 2 && strlistPlat.at(0) == QString("http://www"))
            strPlatID = strlistPlat.at(1);
    }

    bodydata.setTable(m_strTable);
    bodydata.setData(bodydata.GetSafeUtf(strTitle), SCrawlerData::ARTICLE_TITLE);
    bodydata.setData(bodydata.GetSafeUtf(strData), SCrawlerData::ARTICLE_DATA);
    bodydata.setData(strPlatID,SCrawlerData::PLATFORM_ID);
    bodydata.setData(strPlatTitle,SCrawlerData::PLATFORM_TITLE);
    bodydata.setData(strDate, SCrawlerData::ARTICLE_DATE);
    bodydata.setData("naver", SCrawlerData::PLATFORM_NAME);
    bodydata.setData("news", SCrawlerData::PLATFORM_FORM);
    bodydata.setData("body", SCrawlerData::ARTICLE_FORM);
    bodydata.setData(m_strUrl, SCrawlerData::ARTICLE_URL);
    bodydata.setData(m_strKeywordID, SCrawlerData::KEYWORD_ID);
    // The comment count is stored in the ARTICLE_ORDER slot and the sympathy
    // count in the REPLY_URL slot.
    bodydata.setData(strReply, SCrawlerData::ARTICLE_ORDER);
    bodydata.setData(strSympathy, SCrawlerData::REPLY_URL);
    m_bUse = true;
    return true;
}
|
||
|
||
bool SCrawler::saveFrameNewsComment(QWebFrame *frame)
|
||
{
|
||
if (m_bUse) return true;
|
||
static bool bReplyDone = false;
|
||
//static int reply_index = 0;
|
||
static int iLoaded = 0;
|
||
static bool bProcessed = false;
|
||
|
||
if (bProcessed)
|
||
return false;
|
||
bProcessed = true;
|
||
|
||
|
||
//qDebug() << frame->baseUrl().toString();
|
||
//qDebug() << "executed";
|
||
|
||
|
||
if(frame->baseUrl().toString().contains("entertain") && !frame->baseUrl().toString().contains("comment"))
|
||
{
|
||
m_page->mainFrame()->load(QUrl(frame->baseUrl().toString().replace("read", "comment/list")));
|
||
bProcessed = false;
|
||
return false;
|
||
}
|
||
|
||
|
||
if(frame->baseUrl().toString().contains("sports") && !frame->baseUrl().toString().contains("m_view=1"))
|
||
{
|
||
m_page->mainFrame()->load(QUrl(frame->baseUrl().toString() + "&m_view=1"));
|
||
bProcessed = false;
|
||
return false;
|
||
}
|
||
|
||
if(m_nRetryCount < RETRY_MAX && !bReplyDone)
|
||
{
|
||
QWebElement u_cbox_paginate = Find(frame->documentElement(), "div", "class", "u_cbox_paginate");
|
||
if (u_cbox_paginate.isNull())
|
||
{
|
||
++m_nRetryCount;
|
||
// qDebug() << m_nRetryCount;
|
||
QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage()));
|
||
bProcessed = false;
|
||
return false;
|
||
}
|
||
else
|
||
{
|
||
QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button");
|
||
if(!a.isNull())
|
||
{
|
||
a.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||
QTimer::singleShot(300, this, SLOT(reloadPage()));
|
||
// qDebug() << "load comments";
|
||
|
||
//QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button");
|
||
QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li");
|
||
|
||
// qDebug() << lis.count();
|
||
if (lis.count() != iLoaded)
|
||
{
|
||
iLoaded = lis.count();
|
||
bProcessed = false;
|
||
return false;
|
||
}
|
||
else
|
||
{
|
||
bReplyDone = true;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
/*
|
||
QWebElement u_cbox_paginate = Find(frame->documentElement(), "div", "class", "u_cbox_paginate");
|
||
//qDebug() << lis.count();
|
||
|
||
if (!u_cbox_paginate.isNull())
|
||
{
|
||
QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button");
|
||
//QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li");
|
||
|
||
if(!a.isNull())
|
||
{
|
||
a.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||
QTimer::singleShot(300, this, SLOT(reloadPage()));
|
||
qDebug() << "load comments";
|
||
|
||
//QWebElement a = Find(frame->documentElement(), "a", "class", "u_cbox_btn_more __cbox_page_button");
|
||
QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li");
|
||
|
||
qDebug() << lis.count();
|
||
if (lis.count() != iLoaded)
|
||
{
|
||
iLoaded = lis.count();
|
||
bProcessed = false;
|
||
return false;
|
||
}
|
||
}
|
||
|
||
//return false;
|
||
while(!bReplyDone)
|
||
{
|
||
QWebElement current = Find(a, "em", "class", "u_cbox_page_on __cbox_page_current");
|
||
QWebElement total = Find(a, "em", "class", "u_cbox_page_total __cbox_page_total");
|
||
QString str_current = current.toPlainText();
|
||
QString str_total = total.toPlainText();
|
||
bool ok;
|
||
|
||
int n_current = str_current.replace(",", "").toInt(&ok);
|
||
if(!ok)
|
||
break;
|
||
|
||
int n_total = str_total.replace(",", "").toInt(&ok);
|
||
if(!ok)
|
||
break;
|
||
|
||
if(n_current >= n_total)
|
||
{
|
||
bReplyDone = true;
|
||
break;
|
||
}
|
||
a.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||
QTimer::singleShot(300, this, SLOT(reloadPage()));
|
||
qDebug() << "load comments";
|
||
return false;
|
||
}
|
||
|
||
|
||
QWebElementCollection reply_btns = frame->findAllElements("a[class='u_cbox_btn_reply']");
|
||
for(;reply_index < reply_btns.count() ; reply_index++)
|
||
{
|
||
QWebElement btn = Find(reply_btns[reply_index], "span", "class", "u_cbox_reply_cnt");
|
||
if(btn.isNull())
|
||
continue;
|
||
else
|
||
{
|
||
btn.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||
QTimer::singleShot(200, this, SLOT(reloadPage()));
|
||
//reply_index += 1;
|
||
qDebug() << reply_index;
|
||
return false;
|
||
}
|
||
}
|
||
*/
|
||
|
||
/*
|
||
foreach(QWebElement a, reply_btns)
|
||
{
|
||
QWebElement btn = Find(a, "span", "class", "u_cbox_reply_cnt");
|
||
if(btn.isNull())
|
||
continue;
|
||
else
|
||
{
|
||
btn.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||
//QTimer::singleShot(100, this, SLOT(reloadPage()));
|
||
qDebug() << "qq";
|
||
//return false;
|
||
|
||
}
|
||
}
|
||
|
||
}
|
||
*/
|
||
/*
|
||
else
|
||
{
|
||
if(m_nRetryCount < RETRY_MAX)
|
||
{
|
||
m_nRetryCount++;
|
||
qDebug() << m_nRetryCount;
|
||
QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadPage()));
|
||
bProcessed = false;
|
||
return false;
|
||
}
|
||
*/
|
||
/*
|
||
else
|
||
{
|
||
m_bUse = true;
|
||
return true;
|
||
}
|
||
*/
|
||
//}
|
||
/*
|
||
{
|
||
QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li");
|
||
foreach (QWebElement li, lis)
|
||
{
|
||
QWebElement btn = li.findFirst("span[class='u_cbox_reply_cnt']");
|
||
QWebElement atag = li.findFirst("a[class='u_cbox_btn_reply']");
|
||
if (!btn.isNull() && !atag.isNull())
|
||
{
|
||
atag.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||
QTimer::singleShot(300, this, SLOT(reloadPage()));
|
||
qDebug() << "click reply:" << btn.toPlainText();
|
||
|
||
bProcessed = false;
|
||
return false;
|
||
}
|
||
QWebElement div_load_more = li.findFirst("div[class='u_cbox_paginate']");
|
||
if (!div_load_more.isNull())
|
||
{
|
||
QWebElement load_more = div_load_more.findFirst("a[class='u_cbox_btn_more __cbox_page_button']");
|
||
if (!load_more.isNull())
|
||
{
|
||
load_more.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||
QTimer::singleShot(300, this, SLOT(reloadPage()));
|
||
qDebug() << "load more reply";
|
||
bProcessed = false;
|
||
return false;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
*/
|
||
|
||
QWebElementCollection reply_btns = frame->findAllElements("a[class^='u_cbox_btn_reply']");
|
||
|
||
foreach (QWebElement ele, reply_btns)
|
||
{
|
||
QWebElement btn = ele.findFirst("span[class='u_cbox_reply_cnt']");
|
||
|
||
if ((ele.attribute("class") == "u_cbox_btn_reply") && !btn.isNull())
|
||
{
|
||
ele.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||
QTimer::singleShot(250, this, SLOT(reloadPage()));
|
||
//qDebug() << "click reply:" << btn.toPlainText();
|
||
|
||
bProcessed = false;
|
||
return false;
|
||
}
|
||
}
|
||
|
||
QWebElementCollection allPaginate = frame->documentElement().findAll("div[class='u_cbox_paginate']");
|
||
foreach (QWebElement ele, allPaginate)
|
||
{
|
||
QWebElement load_more = ele.findFirst("a[class='u_cbox_btn_more __cbox_page_button']");
|
||
if (!load_more.isNull())
|
||
{
|
||
load_more.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||
QTimer::singleShot(250, this, SLOT(reloadPage()));
|
||
//qDebug() << "load more reply";
|
||
bProcessed = false;
|
||
return false;
|
||
}
|
||
}
|
||
|
||
/*
|
||
//for(;reply_index < reply_btns.count() ;)
|
||
for (int k = 0; k < reply_btns.count(); ++k)
|
||
{
|
||
//QWebElement btn = Find(reply_btns[reply_index], "span", "class", "u_cbox_reply_cnt");
|
||
QWebElement btn = Find(reply_btns[k], "span", "class", "u_cbox_reply_cnt");
|
||
//reply_index += 1;
|
||
|
||
if(btn.isNull())
|
||
continue;
|
||
else
|
||
{
|
||
//QWebElement btnA = Find(reply_btns[reply_index - 1], "a", "class", "u_cbox_btn_reply");
|
||
reply_btns[k].evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
|
||
qDebug() << "load??????????????";
|
||
|
||
qDebug() << reply_btns.count();
|
||
QTimer::singleShot(300, this, SLOT(reloadPage()));
|
||
bProcessed = false;
|
||
|
||
return false;
|
||
}
|
||
}
|
||
*/
|
||
{
|
||
QWebElement logo = Find(frame->documentElement(),"div","class","press_logo");
|
||
QString strPlatID, strPlatTitle;
|
||
{
|
||
strPlatID = Find(logo,"a").attribute("href");
|
||
strPlatTitle = Find(logo,"img").attribute("alt");
|
||
}
|
||
QStringList strlistPlat = strPlatID.split(".");
|
||
if(strlistPlat.size() > 2)
|
||
{
|
||
if (strlistPlat.at(0) == QString("http://www"))
|
||
strPlatID = strlistPlat.at(1);
|
||
}
|
||
//QWebElement ul = frame->findFirstElement("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']");
|
||
QWebElementCollection lis = frame->findAllElements("div[class='u_cbox_content_wrap']>ul[class='u_cbox_list']>li");
|
||
int order = 0;
|
||
foreach(QWebElement li, lis)
|
||
{
|
||
//qDebug() << "li";
|
||
QWebElement comment_box = li.findFirst("div[class='u_cbox_comment_box']");
|
||
QString strParent;
|
||
{
|
||
QString strID, strNick, strData, strLike, strDislike, strDate;
|
||
strData = Find(comment_box, "span", "class", "u_cbox_contents").toPlainText();
|
||
strNick = strParent = strID = Find(comment_box, "span", "class", "u_cbox_name").toPlainText();
|
||
strLike = Find(comment_box, "em", "class", "u_cbox_cnt_recomm").toPlainText().replace(",", "");
|
||
strDislike = Find(comment_box, "em", "class", "u_cbox_cnt_unrecomm").toPlainText().replace(",", "");
|
||
strData += "\n(goodCount:" + strLike +")\n(badCount:" + strDislike + ")";
|
||
|
||
strDate = Find(comment_box, "span", "class", "u_cbox_date").toPlainText();
|
||
if(strDate.contains(":"))
|
||
strDate += ":00";
|
||
else
|
||
{
|
||
QDateTime current_time = QDateTime::currentDateTime();
|
||
QRegExp rx("(\\d+)");
|
||
int pos = 0;
|
||
QString strTime;
|
||
while ((pos = rx.indexIn(strDate, pos)) != -1)
|
||
{
|
||
strTime = rx.cap(1);
|
||
pos += rx.matchedLength();
|
||
}
|
||
|
||
if(strDate.contains("시간"))
|
||
{
|
||
current_time = current_time.addSecs(-(60 * 60 * strTime.toInt()));
|
||
}
|
||
else if(strDate.contains("일"))
|
||
{
|
||
current_time = current_time.addDays(-(strTime.toInt()));
|
||
}
|
||
else if(strDate.contains("분"))
|
||
{
|
||
current_time = current_time.addSecs(-(60 * strTime.toInt()));
|
||
}
|
||
else
|
||
{
|
||
;
|
||
}
|
||
strDate = current_time.toString("yyyy-MM-dd hh:mm:ss");
|
||
// qDebug() << strDate;
|
||
}
|
||
{
|
||
QSqlQuery query;
|
||
query.prepare(QString("insert into " + m_strTable +
|
||
" (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_order,platform_id,platform_title, article_date) "
|
||
"VALUES ('naver','news','reply',:URL,:ID,:NICK,:DATA,:ROWNUM,:PLATFORMID,:TITLE,:DATE)").toUtf8());
|
||
|
||
query.bindValue(":URL",m_strUrl.replace("&m_view=1","").toUtf8());
|
||
query.bindValue(":ID",strID.toUtf8());
|
||
query.bindValue(":NICK",strNick.toUtf8());
|
||
query.bindValue(":DATA",strData.toUtf8());
|
||
query.bindValue(":ROWNUM",order++);
|
||
query.bindValue(":PLATFORMID",strPlatID.toUtf8());
|
||
query.bindValue(":TITLE",strPlatTitle.toUtf8());
|
||
query.bindValue(":DATE", strDate.toUtf8());
|
||
if (query.exec()==false)
|
||
cout << "error : " << query.lastError().text().toStdString();
|
||
}
|
||
}
|
||
QWebElement reply_area = li.findFirst("div[class='u_cbox_reply_area']");
|
||
QWebElementCollection sub_lis = reply_area.findAll("ul[class='u_cbox_list']>li");
|
||
|
||
foreach(QWebElement sub_li, sub_lis)
|
||
{
|
||
QString strID, strNick, strData, strDate;
|
||
strData = Find(sub_li, "span", "class", "u_cbox_contents").toPlainText();
|
||
strNick = strID = Find(sub_li, "span", "class", "u_cbox_name").toPlainText();
|
||
strDate = Find(sub_li, "span", "class", "u_cbox_date").toPlainText();
|
||
if(strDate.contains(":"))
|
||
strDate += ":00";
|
||
else
|
||
{
|
||
QDateTime current_time = QDateTime::currentDateTime();
|
||
QRegExp rx("(\\d+)");
|
||
int pos = 0;
|
||
QString strTime;
|
||
while ((pos = rx.indexIn(strDate, pos)) != -1)
|
||
{
|
||
strTime = rx.cap(1);
|
||
pos += rx.matchedLength();
|
||
}
|
||
|
||
if(strDate.contains("시간"))
|
||
{
|
||
current_time = current_time.addSecs(-(60 * 60 * strTime.toInt()));
|
||
}
|
||
else if(strDate.contains("일"))
|
||
{
|
||
current_time = current_time.addDays(-(strTime.toInt()));
|
||
}
|
||
else if(strDate.contains("분"))
|
||
{
|
||
current_time = current_time.addSecs(-(60 * strTime.toInt()));
|
||
}
|
||
else
|
||
{
|
||
;
|
||
}
|
||
strDate = current_time.toString("yyyy-MM-dd hh:mm:ss");
|
||
}
|
||
|
||
{
|
||
QSqlQuery query;
|
||
query.prepare(QString("insert into " + m_strTable +
|
||
" (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_order,platform_id,platform_title, article_date, article_parent) "
|
||
"VALUES ('naver','news','reply',:URL,:ID,:NICK,:DATA,:ROWNUM,:PLATFORMID,:TITLE,:DATE,:PARENT)").toUtf8());
|
||
|
||
query.bindValue(":URL",m_strUrl.replace("&m_view=1","").toUtf8());
|
||
query.bindValue(":ID",strID.toUtf8());
|
||
query.bindValue(":NICK",strNick.toUtf8());
|
||
query.bindValue(":DATA",strData.toUtf8());
|
||
query.bindValue(":ROWNUM",order++);
|
||
query.bindValue(":PLATFORMID",strPlatID.toUtf8());
|
||
query.bindValue(":TITLE",strPlatTitle.toUtf8());
|
||
query.bindValue(":DATE", strDate.toUtf8());
|
||
query.bindValue(":PARENT", strParent.toUtf8());
|
||
if (query.exec()==false)
|
||
cout << "error : " << query.lastError().text().toStdString();
|
||
|
||
}
|
||
}
|
||
}
|
||
qDebug() << "lis count: " << lis.count();
|
||
}
|
||
|
||
|
||
//Debug("c:\\data\\replytest.html", frame->toHtml());
|
||
m_bUse = true;
|
||
bProcessed = false;
|
||
|
||
return true;
|
||
}
|
||
|
||
|
||
// Searches the descendants of _FindElement selected by _strElement and
// returns the first one whose _strAttrib attribute is exactly equal to
// _strFind. A null QWebElement is returned when no descendant matches.
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();

    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        if (candidate.attribute(_strAttrib) != _strFind)
            continue;               // attribute differs, keep looking
        return candidate;
    }

    // Nothing matched: hand back a default-constructed (null) element.
    return QWebElement();
}
|
||
|
||
// Returns the first descendant of _FindElement selected by _strElement for
// which the substring [_strStart, _strStart + _strLength) of the trimmed
// _strAttrib attribute equals _strFind, ignoring case.
// A null QWebElement is returned when no descendant matches.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();

    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString trimmedValue = candidate.attribute(_strAttrib).trimmed();
        const QString slice = trimmedValue.mid(_strStart, _strLength);

        if (slice.compare(_strFind, Qt::CaseInsensitive) == 0)
            return candidate;
    }

    return QWebElement();
}
|
||
|
||
// Convenience overload: compares exactly _strFind.length() characters of the
// attribute value, starting at offset _strStart.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart, _strFind.length());
}
|
||
|
||
// Returns the first descendant of _FindElement selected by _strElement whose
// trimmed _strAttrib attribute ends with _strFind (case-insensitive).
// For every candidate inspected, the attribute suffix and the searched text
// are written to stdout. A null QWebElement is returned when nothing matches.
QWebElement SCrawler::FindRight(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();

    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString suffix = candidate.attribute(_strAttrib).trimmed().right(_strFind.length());

        // Trace output: one pair of lines per inspected candidate.
        std::cout << "FindRight : " << suffix.toStdString() << std::endl;
        std::cout << "FindRight right : " << _strFind.toStdString() << std::endl;

        if (suffix.compare(_strFind, Qt::CaseInsensitive) == 0)
            return candidate;
    }

    return QWebElement();
}
|
||
|
||
// Returns the first descendant of _FindElement selected by _strElement whose
// trimmed _strAttrib attribute starts with _strFind (case-insensitive).
// Implemented as a prefix comparison via FindMid at offset 0.
QWebElement SCrawler::FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const int prefixOffset = 0;
    const int prefixLength = _strFind.length();
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, prefixOffset, prefixLength);
}
|
||
|
||
|
||
|
||
// Collects every descendant of _FindElement selected by _strElement for which
// the substring [_strStart, _strStart + _strLength) of the trimmed _strAttrib
// attribute equals _strFind, ignoring case. Matches are returned in the order
// findAll() yields them; the list is empty when nothing matches.
QList<QWebElement> SCrawler::FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    QList<QWebElement> matches;

    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();

    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString slice = candidate.attribute(_strAttrib).trimmed().mid(_strStart, _strLength);

        if (slice.compare(_strFind, Qt::CaseInsensitive) != 0)
            continue;

        matches.append(candidate);
    }

    return matches;
}
|
||
|
||
// Reads up to 300 rows from the Proxy table and appends them to _str, one
// "proxy,port\n" line per row. Existing content of _str is kept.
// Returns false if the SELECT fails (in which case _str is untouched),
// true otherwise.
bool SCrawler::getProxyList(QString &_str)
{
    QSqlQuery proxyQuery;
    const QString selectSql = "select proxy, port from Proxy limit 300";

    if (!proxyQuery.exec(selectSql))
        return false;

    while (proxyQuery.next())
    {
        QString row = proxyQuery.value(0).toString();   // proxy host
        row += ",";
        row += proxyQuery.value(1).toString();          // port
        row += "\n";
        _str += row;
    }

    return true;
}
|
||
|
||
bool SCrawler::setProxyFromFile()
|
||
{
|
||
QFile file("proxy.txt");
|
||
QRegExp rx("^\\s*([\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3})[^\\d]*([\\d]*)");
|
||
|
||
if (file.open(QIODevice::ReadOnly | QIODevice::Text))
|
||
{
|
||
QVector <QStringList> vecProxy;
|
||
while (!file.atEnd())
|
||
{
|
||
QString str = QString(file.readLine());
|
||
if (str.isEmpty()) continue;
|
||
int pos = 0;
|
||
QStringList strList;
|
||
while ((pos = rx.indexIn(str, pos)) != -1)
|
||
{
|
||
if (!rx.cap(1).isEmpty())
|
||
strList.append(rx.cap(1));
|
||
if (!rx.cap(2).isEmpty())
|
||
strList.append(rx.cap(2));
|
||
pos += rx.matchedLength();
|
||
}
|
||
if (!strList.isEmpty())
|
||
vecProxy.push_back(strList);
|
||
}
|
||
if (vecProxy.size() > 0)
|
||
{
|
||
QStringList strList = vecProxy.at(rand()%vecProxy.size());
|
||
//QNetworkAccessManager *manager = new QNetworkAccessManager;
|
||
|
||
switch(strList.size())
|
||
{
|
||
case 1:
|
||
cout << "p : " << strList.at(0).toStdString() << " from File" << endl;
|
||
|
||
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
||
//m_page->setNetworkAccessManager(manager);
|
||
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
||
break;
|
||
case 2:
|
||
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl;
|
||
|
||
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
||
//m_page->setNetworkAccessManager(manager);
|
||
|
||
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
||
break;
|
||
}
|
||
}
|
||
else
|
||
{
|
||
return false;
|
||
}
|
||
file.close();
|
||
return true;
|
||
}
|
||
else
|
||
return false;
|
||
}
|
||
|
||
bool SCrawler::setProxyFromDb()
|
||
{
|
||
QString proxyList;
|
||
|
||
if (getProxyList(proxyList))
|
||
{
|
||
QVector <QStringList> vecProxy;
|
||
QStringList strListProxy = proxyList.split("\n");
|
||
foreach(QString str, strListProxy)
|
||
{
|
||
str = str.trimmed();
|
||
if (str.isEmpty()) continue;
|
||
vecProxy.push_back(str.split(","));
|
||
}
|
||
if (vecProxy.size() > 0)
|
||
{
|
||
QStringList strList = vecProxy.at(rand()%vecProxy.size());
|
||
switch(strList.size())
|
||
{
|
||
case 1:
|
||
cout << "p : " << strList.at(0).toStdString() << " from DB" << endl;
|
||
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
||
break;
|
||
case 2:
|
||
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from DB" << endl;
|
||
m_strProxyIP = strList.at(0);
|
||
m_nProxyPort = strList.at(1).toInt();
|
||
|
||
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
||
/*
|
||
QString strProxyHost = "61.103.7.74";
|
||
int nPort = 2074;
|
||
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strProxyHost,nPort)));
|
||
*/
|
||
break;
|
||
}
|
||
return true;
|
||
}
|
||
else
|
||
{
|
||
return false;
|
||
}
|
||
}
|
||
else
|
||
{
|
||
return false;
|
||
}
|
||
}
|
||
|
||
|
||
void SCrawler::setProxy()
|
||
{
|
||
bool ok = setProxyFromFile() || setProxyFromDb();
|
||
//bool ok = false;
|
||
if (!ok)
|
||
cout << "No Proxy" << endl;
|
||
}
|
||
|
||
void SCrawler::deleteProxy()
|
||
{
|
||
if (m_strProxyIP.isEmpty()) return;
|
||
QSqlQuery sqlquery;
|
||
QString strquery = "delete from Proxy where proxy = '" + m_strProxyIP + "' and port = " + QString::number(m_nProxyPort);
|
||
if(sqlquery.exec(strquery.toUtf8()) == false)
|
||
{
|
||
cout << "Error : " << strquery.toStdString() << endl;
|
||
cout << sqlquery.lastError().text().toStdString() << endl;
|
||
}
|
||
}
|
||
|