diff --git a/CrawlerProcess/CrawlerProcess.pro b/CrawlerProcess/CrawlerProcess.pro index 4d469ca..9a4b9f7 100644 --- a/CrawlerProcess/CrawlerProcess.pro +++ b/CrawlerProcess/CrawlerProcess.pro @@ -13,7 +13,7 @@ TEMPLATE = app SOURCES += main.cpp \ scrawler.cpp \ - scrawler_backup.cpp + HEADERS += \ scrawler.h diff --git a/CrawlerProcess/scrawler.cpp b/CrawlerProcess/scrawler.cpp index 0fd56e3..ef64d37 100644 --- a/CrawlerProcess/scrawler.cpp +++ b/CrawlerProcess/scrawler.cpp @@ -67,7 +67,7 @@ void SCrawler::load(QStringList _strlistArgv) { m_strUrl = _strlistArgv[2]; m_nSelect = E_NAVER_CAFE_DATA; - m_strReper = _strlistArgv[4]; + m_strReper = _strlistArgv[4]; } if (_strlistArgv[1] == "blog_list") @@ -76,6 +76,33 @@ void SCrawler::load(QStringList _strlistArgv) m_nSelect = E_NAVER_BLOG_LIST; m_strKeywordID = _strlistArgv[4]; //cout << "ok"; + + QFile file("proxy.txt"); + if (file.open(QIODevice::ReadOnly | QIODevice::Text)) + { + QVector vecProxy; + while (!file.atEnd()) + { + QString str = QString(file.readLine()); + if (str.isEmpty()) continue; + vecProxy.push_back(str.split(",")); + } + if (vecProxy.size() > 0) + { + QStringList strList = vecProxy.at(rand()%vecProxy.size()); + switch(strList.size()) + { + case 1: + cout << "p : " << strList.at(0).toStdString() << endl; + QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); + break; + case 2: + cout << "p : " << strList.at(0).toStdString() << endl; + QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); + break; + } + } + } } if (_strlistArgv[1] == "blog_url") @@ -241,17 +268,16 @@ void SCrawler::saveFrameList(QWebFrame *frame) m_bUse = true; return; } - cout << "url : " << strUrl.toStdString(); QStringList strList = strUrl.split('/'); - if (strList.at(0).compare("blog.naver.com") != 0 ) { cout << " not" << endl; continue; }; + if (strList.at(0).compare("blog.naver.com") != 0 ) { cout << "x http://" << strUrl.toStdString() <= 991) m_bLast = true; m_strListURL.clear(); + foreach(QString str,_strOut.split("\n")) + { + if (str.isEmpty()) continue; + if (str.at(0) == QChar('o')) + m_strListURL.push_back(str.right(str.length()-2).trimmed()); + } + /* QSqlQuery query; if(query.exec("SELECT ARTICLE_URL FROM " + C_TABLE_URL + QString::number(m_nUrlTable) + " where ERROR is null")) { @@ -66,6 +73,7 @@ void SNaverBlogManage::processFinished(QProcess *pPro,QString _strOut) while (query.next()) m_strListURL.append(query.value(0).toString()); + */ m_ncUrl = 0; if (m_strListURL.size() == 0) { @@ -108,7 +116,8 @@ bool SNaverBlogManage::Update() m_pMain->InsertLog(m_nID,"Start : " + QString::number(m_ncList) + " Date : " + m_date.toString("yyyy-MM-dd")); //for(int i = 0; i < C_PROCESS_MAX ; i++) { - m_pro[0].start("CrawlerProcess",QStringList()<< "naver" << "blog_list" << QString::number(m_nUrlTable) << m_strQuery << m_strKeywordID); + m_pro[0].start("CrawlerProcess",QStringList()<< "naver" << "blog_list" << m_strQuery << m_strGroupID << m_strKeywordID); + // m_pro[0].SetState(SProcess::STATE_RUNNING); m_ncList+=10; } m_nMode = E_PROCESS_LIST_FINISH_WAIT; @@ -119,13 +128,11 @@ bool SNaverBlogManage::Update() if (UseProcess() == false) { /* - m_pMain->InsertLog(m_nID,"(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")"); for(int i = 0; i < C_PROCESS_MAX ; i++) { m_pro[0].start("CrawlerProcess",QStringList() << m_strListURL.at(m_ncUrl) << "url"); } - */ - /* + int i = 0; foreach(QString strUrl,m_strListURL) //for(int i = 0; i < C_PROCESS_MAX ; i++) @@ -135,7 +142,9 @@ bool SNaverBlogManage::Update() } */ //m_pro[0].start("CrawlerProcess",QStringList() << m_strListURL.at(m_ncUrl) << "blog_url" << C_TABLE_URL + QString::number(m_nUrlTable)); - m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_url" << QString::number(m_nUrlTable) << m_strListURL.at(m_ncUrl) << makeGetCommentQuery(m_strListURL.at(m_ncUrl)) << "" ); + m_pMain->InsertLog(m_nID,"(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")"); + m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_url" << m_strListURL.at(m_ncUrl) << m_strGroupID << "" ); +// m_pro[0].SetState(SProcess::STATE_RUNNING); m_nMode = E_PROCESS_URL_FINISH_WAIT; m_nWait = 0; } @@ -150,8 +159,10 @@ bool SNaverBlogManage::Update() //if (i >= C_PROCESS_MAX) break; //m_ncUrl++; } - m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_comm" << QString::number(m_nUrlTable) << makeGetCommentQuery(m_strListURL.at(m_ncUrl++)) << "" ); + //m_pMain->InsertLog(m_nID,"(" + QString::number(m_ncUrl+1) + "/" + QString::number(m_strListURL.size()) + ")"); + m_pro[0].start("CrawlerProcess",QStringList() << "naver" << "blog_comm" << makeGetCommentQuery(m_strListURL.at(m_ncUrl++)) << m_strGroupID << "" ); //m_pro[0].start("CrawlerProcess",QStringList() << makeGetCommentQuery(m_strListURL.at(m_ncUrl++))<< "blog_comm" << C_TABLE_COM + QString::number(m_nUrlTable)); + // m_pro[0].SetState(SProcess::STATE_RUNNING); m_nMode = E_PROCESS_COMMENT_FINISH_WAIT; m_nWait = 0; } @@ -162,12 +173,15 @@ bool SNaverBlogManage::Update() m_nWait++; if (m_nWait > (1000000/m_nTime)) { - for(int i = 0; i < C_PROCESS_MAX ; i++) + //for(int i = 0; i < C_PROCESS_MAX ; i++) { - m_pro[i].kill(); - m_pMain->InsertLog(m_nID,"Kill Process."); + { + m_pro[0].kill(); + m_pMain->InsertLog(m_nID,"Kill Process."); + } } if (m_nMode == E_PROCESS_LIST_FINISH_WAIT) return m_bFinalLast; + /* QString strQuery = "update blog set "; strQuery += "Error "; strQuery += "='"; @@ -181,6 +195,7 @@ bool SNaverBlogManage::Update() strQuery += "'"; QSqlQuery sql; sql.exec(strQuery); + */ } break; }