네이버 블로그, 카페 크롤러 문제 수정

This commit is contained in:
mjjo
2017-08-17 16:51:04 +09:00
parent 8854af26d6
commit 24587435b6
6 changed files with 158 additions and 85 deletions

View File

@@ -4,8 +4,8 @@
/// Constructs the Naver news manager and resets its ID counter.
/// @param pObject forwarded unchanged to the SManage base constructor.
SNaverNewsManage::SNaverNewsManage(QObject *pObject) : SManage(pObject)
{
    m_nID = 0;
    // m_pro[0]'s readyReadStandardOutput() is deliberately left unconnected here.
    // The earlier connect() call passed an empty SLOT(), which names no slot and
    // therefore could never establish a connection. Re-add a connect() only
    // together with a real slot signature.
}
QString SNaverNewsManage::makeGetListQuery(QString _str,QDate _date,int _nPage)

View File

@@ -10,7 +10,12 @@ using namespace std;
int main(int argc, char *argv[])
{
srand(time(0));
// cout << "arguments: ";
// for(int i=0; i<argc; i++)
// cout << " " << argv[i];
// cout << endl;
srand(time(0));
QApplication a(argc, argv);
a.setApplicationName(QString("Chrome"));
a.setApplicationVersion(QString("50.0.2661.102"));
@@ -39,5 +44,6 @@ int main(int argc, char *argv[])
process->load(strArgv);
a.exec();
delete process;
return 0;
}

View File

@@ -38,11 +38,18 @@ SCrawler::SCrawler():QObject()
m_nRetryCount = 0;
m_bProcessed = false;
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
srand(time(NULL));
srand(time(NULL));
m_pNAM = new QNetworkAccessManager(this);
m_page->setNetworkAccessManager(m_pNAM);
}
// Destructor: detaches the network access manager from the page, then deletes
// the manager and finally the page, in that order.
SCrawler::~SCrawler()
{
// Clear the page's manager pointer first so the page does not keep referring to
// the manager that is deleted on the next line.
m_page->setNetworkAccessManager(nullptr);
// NOTE(review): the constructor creates m_pNAM with `this` as its QObject parent;
// presumably the explicit delete is still fine because a QObject unregisters
// itself from its parent when destroyed -- confirm against the Qt version in use.
delete m_pNAM;
delete m_page;
}
void SCrawler::load(QStringList _strlistArgv)
@@ -159,22 +166,23 @@ void SCrawler::load(QStringList _strlistArgv)
}
cout << m_strUrl.toStdString() << endl;
QUrl url = QUrl(m_strUrl);
QUrl url = QUrl(m_strUrl);
if (url.scheme().isEmpty())
url.setScheme("http");
m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);
QNetworkRequest *request = new QNetworkRequest;
request->setUrl(url);
QNetworkRequest request;
request.setUrl(url);
/*
request->setRawHeader("Cache-Control","max-age=0, no-cache");
request->setRawHeader("Pragma","no-cache");
request->setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
request.setRawHeader("Cache-Control","max-age=0, no-cache");
request.setRawHeader("Pragma","no-cache");
request.setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
*/
if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
request->setRawHeader("Referer",m_strReper.toLocal8Bit());
request->setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
m_page->mainFrame()->load(*request);
if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
request.setRawHeader("Referer",m_strReper.toLocal8Bit());
request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
m_page->mainFrame()->load(request);
m_bLast = false;
m_bError = false;
}
@@ -198,6 +206,8 @@ void SCrawler::saveResult(bool ok)
{
// qDebug() << "saveResult";
// cout << "page data: "<< m_page->bytesReceived() << endl;
if (!ok)
{
cout << "Failed loading";
@@ -209,14 +219,17 @@ void SCrawler::saveResult(bool ok)
//qDebug() << "load complete";
switch(m_nSelect)
{
case E_NAVER_NEWS_LIST:saveFrameNewsList(m_page->mainFrame());break;
case E_NAVER_NEWS_LIST:
saveFrameNewsList(m_page->mainFrame());
break;
case E_NAVER_NEWS_DATA:
{
static bool loaded = false;
if(!loaded)
{
loaded = true;
if(!saveFrameNewsUrl(m_page->mainFrame()))
if(saveFrameNewsUrl(m_page->mainFrame()) == false)
{
loaded = false;
return;
@@ -229,48 +242,57 @@ void SCrawler::saveResult(bool ok)
break;
}
case E_NAVER_NEWS_REPLY:
{
if(!saveFrameNewsComment(m_page->mainFrame()))
return;
break;
}
case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break;
case E_NAVER_CAFE_LIST:
saveFrameCafeList(m_page->mainFrame());
break;
case E_NAVER_CAFE_DATA:
{
saveFrameCafeUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_NAVER_BLOG_LIST:
{
if(saveFrameList(m_page->mainFrame()))
break;
else
return;
}
case E_NAVER_BLOG_BODY:
{
if(!saveFrameUrl(m_page->mainFrame()))
case E_NAVER_BLOG_LIST:
if(saveFrameList(m_page->mainFrame()) == false)
return;
break;
case E_NAVER_BLOG_BODY:
if(saveFrameUrl(m_page->mainFrame()) == false)
return;
bodydata.sendDB();
break;
}
case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break;
case E_DAUM_CAFE_LIST:saveFrameDaumCafeList(m_page->mainFrame());break;
case E_NAVER_BLOG_REPLY:
saveFrameComment(m_page->mainFrame());
break;
case E_DAUM_CAFE_LIST:
saveFrameDaumCafeList(m_page->mainFrame());
break;
case E_DAUM_CAFE_DATA:
{
saveFrameDaumCafeUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_DAUM_BLOG_LIST:saveFrameDaumBlogList(m_page->mainFrame());break;
case E_DAUM_BLOG_LIST:
saveFrameDaumBlogList(m_page->mainFrame());
break;
case E_DAUM_BLOG_BODY:
{
saveFrameDaumBlogUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_DAUM_BLOG_REPLY:saveFrameDaumBlogComment(m_page->mainFrame());break;
case E_DAUM_BLOG_REPLY:
saveFrameDaumBlogComment(m_page->mainFrame());
break;
}
switch(m_nSelect)
@@ -296,8 +318,8 @@ void SCrawler::saveResult(bool ok)
cout << "last";
m_bLast = false;
}
break;
case E_NAVER_BLOG_REPLY:
case E_NAVER_NEWS_REPLY:
case E_DAUM_BLOG_REPLY:
@@ -327,9 +349,9 @@ void SCrawler::saveResult(bool ok)
}
break;
}
qDebug() << "finish";
emit finished();
qDebug() << " finish";
emit finished();
}
int SCrawler::GetNumber(QString _str)
@@ -407,15 +429,13 @@ void SCrawler::reloadListPage()
bool SCrawler::saveFrameList(QWebFrame *frame)
{
if (m_bProcessed == false)
m_bProcessed = true;
else
return false;
//qDebug() << frame->documentElement().toPlainText();
if (m_bUse == true) return true;
if (m_bUse == true)
return true;
QWebElement notFound = Find(frame->documentElement(),"div","id","notfound");
if(notFound.isNull() == false)
{
@@ -577,7 +597,11 @@ bool SCrawler::saveFrameList(QWebFrame *frame)
{
QWebElement total = Find(eleMain,"span","class","title_num");
if (total.toPlainText().isEmpty()) {m_bError = true; return true;}
if (total.toPlainText().isEmpty())
{
m_bError = true;
return true;
}
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
QStringList strList = m_strUrl.split("&");
bool ok = false;
@@ -817,7 +841,7 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame)
void SCrawler::reloadPage()
{
//qDebug() << "reloadPage called";
// qDebug() << "reloadPage called";
saveResult(true);
}
@@ -1059,7 +1083,11 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame)
{
QWebElement total = Find(eleMain,"span","class","title_num");
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
if (total.toPlainText().isEmpty())
{
m_bError = true;
return;
}
total.toPlainText().split("/").size();
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
QStringList strList = m_strUrl.split("&");
@@ -1377,7 +1405,11 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
b_last = b_last | !(Find(frame->documentElement(), "div", "class", "result_message mg_cont").isNull());
QWebElement total = Find(eleMain,"span","class","f_nb f_l");
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
if (total.toPlainText().isEmpty())
{
m_bError = true;
return;
}
QString strTotal = total.toPlainText().split("/").at(1);
strTotal = strTotal.replace(",","");
@@ -1635,47 +1667,66 @@ void SCrawler::saveFrameDaumBlogList(QWebFrame *frame){}
/**
 * Parses one Naver news search-result page held in @p frame.
 *
 * Prints every article link found (prefixed with "o ") to stdout and updates
 * the crawl state flags:
 *  - m_bLast    : set when the page shows a "no_content" block, or when the
 *                 page counters indicate there is nothing further to fetch.
 *  - m_bNothing : set as soon as at least one usable article link is found.
 *  - m_bError   : set when the counter text does not yield three numbers.
 *  - m_bUse     : set once the counters have been examined, so a repeated
 *                 call for the same page returns immediately.
 *
 * @param frame the loaded main frame of the result page.
 */
void SCrawler::saveFrameNewsList(QWebFrame *frame)
{
    if (m_bUse == true)
        return;

    // An explicit "no content" block means the result list is exhausted.
    QWebElement notFound = Find(frame->documentElement(),"div","class","no_content");
    if (notFound.isNull() == false)
    {
        m_bLast = true;
        return;
    }

    QWebElement eleMain = Find(frame->documentElement(),"div","class","srch_result_area headline");
    foreach(QWebElement eleSub, eleMain.findAll("div"))
    {
        if (eleSub.attribute("class") != QString("info"))
            continue;

        QString str = Find(eleSub,"a","class","go_naver").attribute("href");
        if (str.trimmed().isEmpty())
            continue;
        // Sports articles are skipped.
        if (str.contains("http://sports"))
            continue;

        m_bNothing = true;
        cout << "o " << str.toStdString() << endl;
    }

    // The counters are read from the "title_desc" block; this replaces the
    // older "result_num" span lookup, which is no longer consulted.
    QWebElement eleCounters = Find(frame->documentElement(), "div", "class", "title_desc");
    QStringList nums = bodydata.GetNumber(eleCounters.toPlainText());
    if (nums.count() < 3)
    {
        // Unexpected layout: flag the error and mark the page as handled.
        m_bError = true;
        m_bUse = true;
        return;
    }

    // NOTE(review): presumably <first item>, <last item>, <total hits> of the
    // current page -- confirm against the live page markup.
    const int nFirst  = nums[0].toInt();
    const int nSecond = nums[1].toInt();
    const int nThird  = nums[2].toInt();

    if (nFirst >= nSecond || nSecond == nThird)
        m_bLast = true;

    m_bUse = true;
}
bool SCrawler::saveFrameNewsUrl(QWebFrame *frame)
{
if (m_bUse) return true;
if (m_bUse)
return true;
{
QString strQuery = "delete from ";
@@ -2386,21 +2437,27 @@ bool SCrawler::setProxyFromFile()
//QNetworkAccessManager *manager = new QNetworkAccessManager;
switch(strList.size())
{
{
case 1:
cout << "p : " << strList.at(0).toStdString() << " from File" << endl;
{
cout << "p : " << strList.at(0).toStdString() << " from File" << endl;
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
//m_page->setNetworkAccessManager(manager);
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break;
QNetworkProxy proxy(QNetworkProxy::HttpProxy, strList.at(0));
QNetworkProxy::setApplicationProxy(proxy);
}
break;
case 2:
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl;
{
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl;
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
//m_page->setNetworkAccessManager(manager);
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
QNetworkProxy proxy(QNetworkProxy::HttpProxy, strList.at(0), strList.at(1).toInt());
QNetworkProxy::setApplicationProxy(proxy);
}
break;
}
}
@@ -2439,11 +2496,12 @@ bool SCrawler::setProxyFromDb()
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break;
case 2:
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from DB" << endl;
m_strProxyIP = strList.at(0);
m_nProxyPort = strList.at(1).toInt();
m_strProxyIP = strList.at(0);
m_nProxyPort = strList.at(1).toInt();
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
cout << "p : " << m_strProxyIP.toStdString() << ":" << m_nProxyPort << " from DB" << endl;
//QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy, m_strProxyIP, m_nProxyPort)));
/*
QString strProxyHost = "61.103.7.74";
int nPort = 2074;
@@ -2468,7 +2526,6 @@ bool SCrawler::setProxyFromDb()
/// Configures the proxy, preferring the file-based source. The database source
/// is consulted only when the file source reports failure. Prints "No Proxy"
/// when neither source succeeds.
void SCrawler::setProxy()
{
    if (setProxyFromFile())
        return;
    if (setProxyFromDb())
        return;
    cout << "No Proxy" << endl;
}

View File

@@ -6,6 +6,7 @@
class SCrawler : public QObject
{
Q_OBJECT
public:
enum E_SELECT
{
@@ -25,7 +26,7 @@ public:
};
public:
SCrawler();
~SCrawler();
virtual ~SCrawler();
void load(QStringList _strlistArgv);
void saveFile();
static void Debug(QString _strFilename,QString _strData);
@@ -35,6 +36,7 @@ private slots:
void saveResult(bool ok);
void reloadPage();
void reloadListPage();
private:
int m_nSelect;
QString m_strReper;
@@ -43,6 +45,7 @@ private:
SCrawlerData bodydata;
QWebPage *m_page;
QNetworkAccessManager* m_pNAM;
QString m_strFile;
QString m_strUrl;
QString m_strTable;

View File

@@ -136,6 +136,9 @@ QStringList SCrawlerData::GetNumber(QString _str)
{
if (pch[i].isNumber() || pch[i].isSpace())
str += pch[i];
else if(pch[i] != ',' && pch[i] != '.')
str += ' ';
}
return str.trimmed().split(" ");
return str.trimmed().split(" ", QString::SkipEmptyParts);
}

View File

@@ -25,9 +25,13 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def is_debugger_attached():
    """Return True if any frame on the current call stack comes from pydevd.py.

    Returns False when no such frame exists, or when the stack cannot be
    inspected at all.
    """
    try:
        for frame in inspect.stack():
            # frame[1] is the source filename of the stack frame.
            if frame[1].endswith("pydevd.py"):
                return True
    except Exception:
        # Best effort: a failure to walk the stack is treated as "no debugger".
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return False
    return False
is_debug = is_debugger_attached()