|
|
|
|
@@ -38,11 +38,18 @@ SCrawler::SCrawler():QObject()
|
|
|
|
|
m_nRetryCount = 0;
|
|
|
|
|
m_bProcessed = false;
|
|
|
|
|
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
|
|
|
|
|
srand(time(NULL));
|
|
|
|
|
srand(time(NULL));
|
|
|
|
|
|
|
|
|
|
m_pNAM = new QNetworkAccessManager(this);
|
|
|
|
|
m_page->setNetworkAccessManager(m_pNAM);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Destructor.
//
// Clears the page's network access manager pointer, then deletes the
// manager and finally the page. The statements run in exactly this order.
SCrawler::~SCrawler()
{
    // Detach the manager from the page before the manager object goes away.
    m_page->setNetworkAccessManager(nullptr);

    delete m_pNAM;
    delete m_page;
}
|
|
|
|
|
|
|
|
|
|
void SCrawler::load(QStringList _strlistArgv)
|
|
|
|
|
@@ -159,22 +166,23 @@ void SCrawler::load(QStringList _strlistArgv)
|
|
|
|
|
}
|
|
|
|
|
cout << m_strUrl.toStdString() << endl;
|
|
|
|
|
|
|
|
|
|
QUrl url = QUrl(m_strUrl);
|
|
|
|
|
QUrl url = QUrl(m_strUrl);
|
|
|
|
|
if (url.scheme().isEmpty())
|
|
|
|
|
url.setScheme("http");
|
|
|
|
|
|
|
|
|
|
m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);
|
|
|
|
|
QNetworkRequest *request = new QNetworkRequest;
|
|
|
|
|
request->setUrl(url);
|
|
|
|
|
QNetworkRequest request;
|
|
|
|
|
request.setUrl(url);
|
|
|
|
|
/*
|
|
|
|
|
request->setRawHeader("Cache-Control","max-age=0, no-cache");
|
|
|
|
|
request->setRawHeader("Pragma","no-cache");
|
|
|
|
|
request->setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
|
|
|
|
|
request.setRawHeader("Cache-Control","max-age=0, no-cache");
|
|
|
|
|
request.setRawHeader("Pragma","no-cache");
|
|
|
|
|
request.setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
|
|
|
|
|
*/
|
|
|
|
|
if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
|
|
|
|
|
request->setRawHeader("Referer",m_strReper.toLocal8Bit());
|
|
|
|
|
request->setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
|
|
|
|
|
m_page->mainFrame()->load(*request);
|
|
|
|
|
if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
|
|
|
|
|
request.setRawHeader("Referer",m_strReper.toLocal8Bit());
|
|
|
|
|
request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
|
|
|
|
|
m_page->mainFrame()->load(request);
|
|
|
|
|
|
|
|
|
|
m_bLast = false;
|
|
|
|
|
m_bError = false;
|
|
|
|
|
}
|
|
|
|
|
@@ -198,6 +206,8 @@ void SCrawler::saveResult(bool ok)
|
|
|
|
|
{
|
|
|
|
|
// qDebug() << "saveResult";
|
|
|
|
|
|
|
|
|
|
// cout << "page data: "<< m_page->bytesReceived() << endl;
|
|
|
|
|
|
|
|
|
|
if (!ok)
|
|
|
|
|
{
|
|
|
|
|
cout << "Failed loading";
|
|
|
|
|
@@ -209,14 +219,17 @@ void SCrawler::saveResult(bool ok)
|
|
|
|
|
//qDebug() << "load complete";
|
|
|
|
|
switch(m_nSelect)
|
|
|
|
|
{
|
|
|
|
|
case E_NAVER_NEWS_LIST:saveFrameNewsList(m_page->mainFrame());break;
|
|
|
|
|
case E_NAVER_NEWS_LIST:
|
|
|
|
|
saveFrameNewsList(m_page->mainFrame());
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case E_NAVER_NEWS_DATA:
|
|
|
|
|
{
|
|
|
|
|
static bool loaded = false;
|
|
|
|
|
if(!loaded)
|
|
|
|
|
{
|
|
|
|
|
loaded = true;
|
|
|
|
|
if(!saveFrameNewsUrl(m_page->mainFrame()))
|
|
|
|
|
if(saveFrameNewsUrl(m_page->mainFrame()) == false)
|
|
|
|
|
{
|
|
|
|
|
loaded = false;
|
|
|
|
|
return;
|
|
|
|
|
@@ -229,48 +242,57 @@ void SCrawler::saveResult(bool ok)
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case E_NAVER_NEWS_REPLY:
|
|
|
|
|
{
|
|
|
|
|
if(!saveFrameNewsComment(m_page->mainFrame()))
|
|
|
|
|
return;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break;
|
|
|
|
|
|
|
|
|
|
case E_NAVER_CAFE_LIST:
|
|
|
|
|
saveFrameCafeList(m_page->mainFrame());
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case E_NAVER_CAFE_DATA:
|
|
|
|
|
{
|
|
|
|
|
saveFrameCafeUrl(m_page->mainFrame());
|
|
|
|
|
bodydata.sendDB();
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case E_NAVER_BLOG_LIST:
|
|
|
|
|
{
|
|
|
|
|
if(saveFrameList(m_page->mainFrame()))
|
|
|
|
|
break;
|
|
|
|
|
else
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
case E_NAVER_BLOG_BODY:
|
|
|
|
|
{
|
|
|
|
|
if(!saveFrameUrl(m_page->mainFrame()))
|
|
|
|
|
|
|
|
|
|
case E_NAVER_BLOG_LIST:
|
|
|
|
|
if(saveFrameList(m_page->mainFrame()) == false)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case E_NAVER_BLOG_BODY:
|
|
|
|
|
if(saveFrameUrl(m_page->mainFrame()) == false)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
bodydata.sendDB();
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break;
|
|
|
|
|
case E_DAUM_CAFE_LIST:saveFrameDaumCafeList(m_page->mainFrame());break;
|
|
|
|
|
|
|
|
|
|
case E_NAVER_BLOG_REPLY:
|
|
|
|
|
saveFrameComment(m_page->mainFrame());
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case E_DAUM_CAFE_LIST:
|
|
|
|
|
saveFrameDaumCafeList(m_page->mainFrame());
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case E_DAUM_CAFE_DATA:
|
|
|
|
|
{
|
|
|
|
|
saveFrameDaumCafeUrl(m_page->mainFrame());
|
|
|
|
|
bodydata.sendDB();
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case E_DAUM_BLOG_LIST:saveFrameDaumBlogList(m_page->mainFrame());break;
|
|
|
|
|
|
|
|
|
|
case E_DAUM_BLOG_LIST:
|
|
|
|
|
saveFrameDaumBlogList(m_page->mainFrame());
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case E_DAUM_BLOG_BODY:
|
|
|
|
|
{
|
|
|
|
|
saveFrameDaumBlogUrl(m_page->mainFrame());
|
|
|
|
|
bodydata.sendDB();
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
case E_DAUM_BLOG_REPLY:saveFrameDaumBlogComment(m_page->mainFrame());break;
|
|
|
|
|
|
|
|
|
|
case E_DAUM_BLOG_REPLY:
|
|
|
|
|
saveFrameDaumBlogComment(m_page->mainFrame());
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
switch(m_nSelect)
|
|
|
|
|
@@ -296,8 +318,8 @@ void SCrawler::saveResult(bool ok)
|
|
|
|
|
cout << "last";
|
|
|
|
|
m_bLast = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case E_NAVER_BLOG_REPLY:
|
|
|
|
|
case E_NAVER_NEWS_REPLY:
|
|
|
|
|
case E_DAUM_BLOG_REPLY:
|
|
|
|
|
@@ -327,9 +349,9 @@ void SCrawler::saveResult(bool ok)
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
qDebug() << "finish";
|
|
|
|
|
emit finished();
|
|
|
|
|
|
|
|
|
|
qDebug() << " finish";
|
|
|
|
|
emit finished();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int SCrawler::GetNumber(QString _str)
|
|
|
|
|
@@ -407,15 +429,13 @@ void SCrawler::reloadListPage()
|
|
|
|
|
|
|
|
|
|
bool SCrawler::saveFrameList(QWebFrame *frame)
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
if (m_bProcessed == false)
|
|
|
|
|
m_bProcessed = true;
|
|
|
|
|
else
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
//qDebug() << frame->documentElement().toPlainText();
|
|
|
|
|
|
|
|
|
|
if (m_bUse == true) return true;
|
|
|
|
|
if (m_bUse == true)
|
|
|
|
|
return true;
|
|
|
|
|
QWebElement notFound = Find(frame->documentElement(),"div","id","notfound");
|
|
|
|
|
if(notFound.isNull() == false)
|
|
|
|
|
{
|
|
|
|
|
@@ -577,7 +597,11 @@ bool SCrawler::saveFrameList(QWebFrame *frame)
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
QWebElement total = Find(eleMain,"span","class","title_num");
|
|
|
|
|
if (total.toPlainText().isEmpty()) {m_bError = true; return true;}
|
|
|
|
|
if (total.toPlainText().isEmpty())
|
|
|
|
|
{
|
|
|
|
|
m_bError = true;
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
|
|
|
|
|
QStringList strList = m_strUrl.split("&");
|
|
|
|
|
bool ok = false;
|
|
|
|
|
@@ -817,7 +841,7 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame)
|
|
|
|
|
|
|
|
|
|
void SCrawler::reloadPage()
|
|
|
|
|
{
|
|
|
|
|
//qDebug() << "reloadPage called";
|
|
|
|
|
// qDebug() << "reloadPage called";
|
|
|
|
|
saveResult(true);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -1059,7 +1083,11 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame)
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
QWebElement total = Find(eleMain,"span","class","title_num");
|
|
|
|
|
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
|
|
|
|
|
if (total.toPlainText().isEmpty())
|
|
|
|
|
{
|
|
|
|
|
m_bError = true;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
total.toPlainText().split("/").size();
|
|
|
|
|
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
|
|
|
|
|
QStringList strList = m_strUrl.split("&");
|
|
|
|
|
@@ -1377,7 +1405,11 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
|
|
|
|
|
b_last = b_last | !(Find(frame->documentElement(), "div", "class", "result_message mg_cont").isNull());
|
|
|
|
|
|
|
|
|
|
QWebElement total = Find(eleMain,"span","class","f_nb f_l");
|
|
|
|
|
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
|
|
|
|
|
if (total.toPlainText().isEmpty())
|
|
|
|
|
{
|
|
|
|
|
m_bError = true;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
QString strTotal = total.toPlainText().split("/").at(1);
|
|
|
|
|
strTotal = strTotal.replace(",","");
|
|
|
|
|
@@ -1635,47 +1667,66 @@ void SCrawler::saveFrameDaumBlogList(QWebFrame *frame){}
|
|
|
|
|
|
|
|
|
|
// Processes a loaded news search-result page.
//
// Does nothing when m_bUse is already set. Otherwise:
//   - sets m_bLast and returns if the page contains a "no_content" div;
//   - prints every non-empty, non-sports "go_naver" link found in the
//     "srch_result_area headline" container and sets m_bNothing for each;
//   - reads the numbers in the "title_desc" div to decide whether this is the
//     last result page (m_bLast), flagging m_bError when fewer than three
//     numbers are present.
// m_bUse is set once the counters have been examined, so repeated calls for
// the same page are ignored.
//
// frame: frame whose document is scanned.
void SCrawler::saveFrameNewsList(QWebFrame *frame)
{
    if (m_bUse == true)
        return;

    // Debug aid: dump the page markup to a file.
    // QFile file("pagedata.txt");
    // if ( file.open(QIODevice::ReadWrite) )
    // {
    //     QTextStream stream( &file );
    //     stream << frame->documentElement().toOuterXml() << endl;
    //     file.close();
    // }

    QWebElement notFound = Find(frame->documentElement(),"div","class","no_content");
    if(notFound.isNull() == false)
    {
        m_bLast = true;
        return;
    }

    QWebElement eleMain = Find(frame->documentElement(),"div","class","srch_result_area headline");
    foreach(QWebElement eleSub,eleMain.findAll("div"))
    {
        if (eleSub.attribute("class") == QString("info"))
        {
            QString str = Find(eleSub,"a","class","go_naver").attribute("href");
            if (str.trimmed().isEmpty())
                continue;
            if (str.contains("http://sports"))
                continue;
            m_bNothing = true;
            cout << "o " << str.toStdString() << endl;
        }
    }

    // The counters come from the "title_desc" block; the earlier lookup via
    // the "result_num" span is retired and must not be mixed with this one
    // (doing so declared vecTotal twice in the same scope).
    QWebElement Total = Find(frame->documentElement(), "div", "class", "title_desc");
    QStringList nums = bodydata.GetNumber(Total.toPlainText());

    if(nums.count() < 3)
    {
        // Not enough numbers to evaluate paging: report an error and mark
        // the page as handled.
        m_bError = true;
        m_bUse = true;
        return;
    }

    // NOTE(review): the three values are presumably range start, range end
    // and total hit count - confirm against the page markup.
    QVector <int> vecTotal;
    vecTotal.push_back(nums[0].toInt());
    vecTotal.push_back(nums[1].toInt());
    vecTotal.push_back(nums[2].toInt());

    if (vecTotal[0] >= vecTotal[1] || vecTotal[1] == vecTotal[2])
        m_bLast = true;

    m_bUse = true;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool SCrawler::saveFrameNewsUrl(QWebFrame *frame)
|
|
|
|
|
{
|
|
|
|
|
if (m_bUse) return true;
|
|
|
|
|
if (m_bUse)
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
QString strQuery = "delete from ";
|
|
|
|
|
@@ -2386,21 +2437,27 @@ bool SCrawler::setProxyFromFile()
|
|
|
|
|
//QNetworkAccessManager *manager = new QNetworkAccessManager;
|
|
|
|
|
|
|
|
|
|
switch(strList.size())
|
|
|
|
|
{
|
|
|
|
|
{
|
|
|
|
|
case 1:
|
|
|
|
|
cout << "p : " << strList.at(0).toStdString() << " from File" << endl;
|
|
|
|
|
{
|
|
|
|
|
cout << "p : " << strList.at(0).toStdString() << " from File" << endl;
|
|
|
|
|
|
|
|
|
|
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
|
|
|
|
//m_page->setNetworkAccessManager(manager);
|
|
|
|
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
|
|
|
|
break;
|
|
|
|
|
QNetworkProxy proxy(QNetworkProxy::HttpProxy, strList.at(0));
|
|
|
|
|
QNetworkProxy::setApplicationProxy(proxy);
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
case 2:
|
|
|
|
|
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl;
|
|
|
|
|
{
|
|
|
|
|
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl;
|
|
|
|
|
|
|
|
|
|
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
|
|
|
|
//m_page->setNetworkAccessManager(manager);
|
|
|
|
|
|
|
|
|
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
|
|
|
|
QNetworkProxy proxy(QNetworkProxy::HttpProxy, strList.at(0), strList.at(1).toInt());
|
|
|
|
|
QNetworkProxy::setApplicationProxy(proxy);
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
@@ -2439,11 +2496,12 @@ bool SCrawler::setProxyFromDb()
|
|
|
|
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
|
|
|
|
break;
|
|
|
|
|
case 2:
|
|
|
|
|
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from DB" << endl;
|
|
|
|
|
m_strProxyIP = strList.at(0);
|
|
|
|
|
m_nProxyPort = strList.at(1).toInt();
|
|
|
|
|
m_strProxyIP = strList.at(0);
|
|
|
|
|
m_nProxyPort = strList.at(1).toInt();
|
|
|
|
|
|
|
|
|
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
|
|
|
|
cout << "p : " << m_strProxyIP.toStdString() << ":" << m_nProxyPort << " from DB" << endl;
|
|
|
|
|
|
|
|
|
|
//QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy, m_strProxyIP, m_nProxyPort)));
|
|
|
|
|
/*
|
|
|
|
|
QString strProxyHost = "61.103.7.74";
|
|
|
|
|
int nPort = 2074;
|
|
|
|
|
@@ -2468,7 +2526,6 @@ bool SCrawler::setProxyFromDb()
|
|
|
|
|
void SCrawler::setProxy()
|
|
|
|
|
{
|
|
|
|
|
bool ok = setProxyFromFile() || setProxyFromDb();
|
|
|
|
|
//bool ok = false;
|
|
|
|
|
if (!ok)
|
|
|
|
|
cout << "No Proxy" << endl;
|
|
|
|
|
}
|
|
|
|
|
|