diff --git a/CrawlerProcess/main.cpp b/CrawlerProcess/main.cpp index 440ccd2..a3bff35 100644 --- a/CrawlerProcess/main.cpp +++ b/CrawlerProcess/main.cpp @@ -13,8 +13,8 @@ int main(int argc, char *argv[]) srand(time(0)); QApplication a(argc, argv); a.setApplicationName(QString("Chrome")); - a.setApplicationVersion(QString("39.0.2171.95")); - + a.setApplicationVersion(QString("50.0.2661.102")); + //39.0.2171.95 QSqlDatabase db = QSqlDatabase::addDatabase("QMYSQL"); db.setHostName("bigbird.iptime.org"); db.setUserName("admin"); diff --git a/CrawlerProcess/scrawler.cpp b/CrawlerProcess/scrawler.cpp index ecc4292..96fa6d0 100644 --- a/CrawlerProcess/scrawler.cpp +++ b/CrawlerProcess/scrawler.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include using namespace std; @@ -22,7 +24,9 @@ SCrawler::SCrawler():QObject() { m_page = new QWebPage; m_nRetryCount = 0; + m_bProcessed = false; connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool))); + srand(time(NULL)); } SCrawler::~SCrawler() @@ -141,7 +145,6 @@ void SCrawler::load(QStringList _strlistArgv) } m_strTable = "data_" + _strlistArgv[3]; } - cout << m_strUrl.toStdString() << endl; QUrl url = QUrl(m_strUrl); @@ -181,6 +184,8 @@ void SCrawler::UpdateError(QString _strError) void SCrawler::saveResult(bool ok) { + qDebug() << "saveResult"; + if (!ok) { cout << "Failed loading"; @@ -188,7 +193,7 @@ void SCrawler::saveResult(bool ok) emit finished(); return; } - qDebug() << "load complete"; + //qDebug() << "load complete"; switch(m_nSelect) { case E_NAVER_NEWS_LIST:saveFrameNewsList(m_page->mainFrame());break; @@ -216,7 +221,13 @@ void SCrawler::saveResult(bool ok) bodydata.sendDB(); break; } - case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break; + case E_NAVER_BLOG_LIST: + { + if(saveFrameList(m_page->mainFrame())) + break; + else + return; + } case E_NAVER_BLOG_BODY: { if(!saveFrameUrl(m_page->mainFrame())) @@ -297,6 +308,7 @@ void SCrawler::saveResult(bool ok) break; } emit finished(); + } int SCrawler::GetNumber(QString _str) @@ -310,6 +322,19 @@ int SCrawler::GetNumber(QString _str) return strNumber.toInt(); } + +int SCrawler::GetNumber(QString _str, bool &ok) +{ + QString strNumber; + for (int i = 0; i < _str.size();i++) + { + if (_str.at(i).isNumber()) + strNumber += _str.at(i); + } + return strNumber.toInt(&ok); +} + + void SCrawler::Debug(QString _strFilename,QString _strData) { QFile file(_strFilename); @@ -345,20 +370,55 @@ QString SCrawler::GetSafeUtf(QString _strData) return str; } -void SCrawler::saveFrameList(QWebFrame *frame) +void SCrawler::reloadListPage() { - if (m_bUse == true) return; + ++m_nRetryCount; + if (m_nRetryCount >= RETRY_MAX) + { + cout << "block"; + emit finished(); + return; + } + m_bProcessed = false; + saveResult(true); +} + +bool SCrawler::saveFrameList(QWebFrame *frame) +{ + + if (m_bProcessed == false) + m_bProcessed = true; + else + return false; + + //qDebug() << frame->documentElement().toPlainText(); + + if (m_bUse == true) return true; QWebElement notFound = Find(frame->documentElement(),"div","id","notfound"); if(notFound.isNull() == false) { m_bLast = true; - return; + return true; + } + + QWebElement naverBlock = Find(frame->documentElement(), "p", "class", "info01"); + if (!naverBlock.isNull()) + { + m_bError = true; + cout << "naver"; + return true; } QWebElement eleMain = Find(frame->documentElement(),"div","class","blog section _blogBase"); QStringList urlList; + if (eleMain.isNull()) + { + QTimer::singleShot(RETRY_INTERVAL, this, SLOT(reloadListPage())); + return false; + } + for (int i = 0; i < 10 ; i++) { QString str = "sp_blog_"; @@ -495,15 +555,21 @@ void SCrawler::saveFrameList(QWebFrame *frame) } { - QWebElement total = Find(eleMain,"span","class","title_num"); - if (total.toPlainText().isEmpty()) {m_bError = true; return;} + if (total.toPlainText().isEmpty()) {m_bError = true; return true;} int nTotal = GetNumber(total.toPlainText().split("/").at(1)); QStringList strList = m_strUrl.split("&"); - int nNow = GetNumber(strList.at(strList.size() - 1).split("=").at(1)); + bool ok = false; + int nNow = GetNumber(strList.at(strList.size() - 1).split("=").at(1), ok); + if (!ok) + { + m_bError = true; + return true; + } if ((nNow + 10) > nTotal || nNow >= 1000) m_bLast = true; } + return true; } enum E_DATA @@ -873,6 +939,15 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame) m_bLast = true; return; } + + QWebElement naverBlock = Find(frame->documentElement(), "p", "class", "info01"); + if (!naverBlock.isNull()) + { + m_bError = true; + cout << "naver"; + return; + } + QStringList urlList; QWebElement eleMain = Find(frame->documentElement(),"div","class","cafe_article section _cafeBase"); @@ -1687,6 +1762,7 @@ bool SCrawler::saveFrameNewsComment(QWebFrame *frame) qDebug() << frame->baseUrl().toString(); qDebug() << "executed"; + if(frame->baseUrl().toString().contains("entertain") && !frame->baseUrl().toString().contains("comment")) { m_page->mainFrame()->load(QUrl(frame->baseUrl().toString().replace("read", "comment/list"))); @@ -2030,12 +2106,74 @@ bool SCrawler::getProxyList(QString &_str) str += "\n"; _str += str; } + return true; } -void SCrawler::setProxy() +bool SCrawler::setProxyFromFile() +{ + QFile file("proxy.txt"); + QRegExp rx("^\\s*([\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3})[^\\d]*([\\d]*)"); + + if (file.open(QIODevice::ReadOnly | QIODevice::Text)) + { + QVector vecProxy; + while (!file.atEnd()) + { + QString str = QString(file.readLine()); + if (str.isEmpty()) continue; + int pos = 0; + QStringList strList; + while ((pos = rx.indexIn(str, pos)) != -1) + { + if (!rx.cap(1).isEmpty()) + strList.append(rx.cap(1)); + if (!rx.cap(2).isEmpty()) + strList.append(rx.cap(2)); + pos += rx.matchedLength(); + } + if (!strList.isEmpty()) + vecProxy.push_back(strList); + } + if (vecProxy.size() > 0) + { + QStringList strList = vecProxy.at(rand()%vecProxy.size()); + //QNetworkAccessManager *manager = new QNetworkAccessManager; + + switch(strList.size()) + { + case 1: + cout << "p : " << strList.at(0).toStdString() << " from File" << endl; + + //manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); + //m_page->setNetworkAccessManager(manager); + QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); + break; + case 2: + cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl; + + //manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); + //m_page->setNetworkAccessManager(manager); + + QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); + break; + } + } + else + { + return false; + } + file.close(); + return true; + } + else + return false; +} + +bool SCrawler::setProxyFromDb() { QString proxyList; + if (getProxyList(proxyList)) { QVector vecProxy; @@ -2050,81 +2188,45 @@ void SCrawler::setProxy() { QStringList strList = vecProxy.at(rand()%vecProxy.size()); switch(strList.size()) - { + { case 1: - cout << "p : " << strList.at(0).toStdString() << endl; + cout << "p : " << strList.at(0).toStdString() << " from DB" << endl; QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); - break; - case 2: - cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); + break; + case 2: + cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from DB" << endl; m_strProxyIP = strList.at(0); m_nProxyPort = strList.at(1).toInt(); - //QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,"196.201.216.172",8088))); + + QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); + /* + QString strProxyHost = "61.103.7.74"; + int nPort = 2074; + QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strProxyHost,nPort))); + */ break; } + return true; } else { - QFile file("proxy.txt"); - if (file.open(QIODevice::ReadOnly | QIODevice::Text)) - { - QVector vecProxy; - while (!file.atEnd()) - { - QString str = QString(file.readLine()); - if (str.isEmpty()) continue; - vecProxy.push_back(str.split(",")); - } - if (vecProxy.size() > 0) - { - QStringList strList = vecProxy.at(rand()%vecProxy.size()); - switch(strList.size()) - { - case 1: - cout << "p : " << strList.at(0).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); - break; - case 2: - cout << "p : " << strList.at(0).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); - break; - } - } - } + return false; } } else { - QFile file("proxy.txt"); - if (file.open(QIODevice::ReadOnly | QIODevice::Text)) - { - QVector vecProxy; - while (!file.atEnd()) - { - QString str = QString(file.readLine()); - if (str.isEmpty()) continue; - vecProxy.push_back(str.split(",")); - } - if (vecProxy.size() > 0) - { - QStringList strList = vecProxy.at(rand()%vecProxy.size()); - switch(strList.size()) - { - case 1: - cout << "p : " << strList.at(0).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); - break; - case 2: - cout << "p : " << strList.at(0).toStdString() << endl; - QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); - break; - } - } - } + return false; } } + +void SCrawler::setProxy() +{ + bool ok = setProxyFromFile() || setProxyFromDb(); + if (!ok) + cout << "No Proxy" << endl; +} + void SCrawler::deleteProxy() { if (m_strProxyIP.isEmpty()) return; diff --git a/CrawlerProcess/scrawler.h b/CrawlerProcess/scrawler.h index 984caea..497522b 100644 --- a/CrawlerProcess/scrawler.h +++ b/CrawlerProcess/scrawler.h @@ -34,6 +34,7 @@ signals: private slots: void saveResult(bool ok); void reloadPage(); + void reloadListPage(); private: int m_nSelect; QString m_strReper; @@ -50,12 +51,14 @@ private: bool m_bLast; bool m_bError; bool m_bNothing; + bool m_bProcessed; + int m_nRetryCount; QString m_strProxyIP; int m_nProxyPort; - int m_nRetryCount; + QString SqlString(QString _str); QString GetSafeUtf(QString _strData); - void saveFrameList(QWebFrame *frame); + bool saveFrameList(QWebFrame *frame); void saveFrameCafeList(QWebFrame *frame); bool saveFrameUrl(QWebFrame *frame); void saveFrameComment(QWebFrame *frame); @@ -70,7 +73,11 @@ private: bool saveFrameNewsComment(QWebFrame *frame); int GetNumber(QString _str); + int GetNumber(QString _str, bool &ok); + bool getProxyList(QString &_str); + bool setProxyFromFile(); + bool setProxyFromDb(); void setProxy(); void deleteProxy();