Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
24587435b6 | ||
|
|
8854af26d6 | ||
|
|
ed7a6ddad9 | ||
|
|
87968097a9 | ||
|
|
3142782428 | ||
|
|
aa2f5b9f71 |
12
.gitignore
vendored
12
.gitignore
vendored
@@ -1,8 +1,12 @@
|
|||||||
**/.idea/
|
|
||||||
**/__pycache__/
|
**/__pycache__/
|
||||||
*.user
|
**/.idea/
|
||||||
**/build-*/
|
**/build-*/
|
||||||
WebBasedCrawler/proxy.txt
|
|
||||||
clients-win/
|
clients-win/
|
||||||
clients-linux/
|
clients-linux/
|
||||||
**/*.log
|
|
||||||
|
*.user
|
||||||
|
*.csv
|
||||||
|
*.log
|
||||||
|
*.bak
|
||||||
|
|
||||||
|
WebBasedCrawler/proxy.txt
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
SNaverNewsManage::SNaverNewsManage(QObject *pObject) : SManage(pObject)
|
SNaverNewsManage::SNaverNewsManage(QObject *pObject) : SManage(pObject)
|
||||||
{
|
{
|
||||||
m_nID = 0;
|
m_nID = 0;
|
||||||
connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT());
|
// connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT());
|
||||||
}
|
}
|
||||||
|
|
||||||
QString SNaverNewsManage::makeGetListQuery(QString _str,QDate _date,int _nPage)
|
QString SNaverNewsManage::makeGetListQuery(QString _str,QDate _date,int _nPage)
|
||||||
|
|||||||
@@ -10,6 +10,11 @@ using namespace std;
|
|||||||
|
|
||||||
int main(int argc, char *argv[])
|
int main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
|
// cout << "arguments: ";
|
||||||
|
// for(int i=0; i<argc; i++)
|
||||||
|
// cout << " " << argv[i];
|
||||||
|
// cout << endl;
|
||||||
|
|
||||||
srand(time(0));
|
srand(time(0));
|
||||||
QApplication a(argc, argv);
|
QApplication a(argc, argv);
|
||||||
a.setApplicationName(QString("Chrome"));
|
a.setApplicationName(QString("Chrome"));
|
||||||
@@ -39,5 +44,6 @@ int main(int argc, char *argv[])
|
|||||||
|
|
||||||
process->load(strArgv);
|
process->load(strArgv);
|
||||||
a.exec();
|
a.exec();
|
||||||
|
delete process;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -39,10 +39,17 @@ SCrawler::SCrawler():QObject()
|
|||||||
m_bProcessed = false;
|
m_bProcessed = false;
|
||||||
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
|
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
|
||||||
srand(time(NULL));
|
srand(time(NULL));
|
||||||
|
|
||||||
|
m_pNAM = new QNetworkAccessManager(this);
|
||||||
|
m_page->setNetworkAccessManager(m_pNAM);
|
||||||
}
|
}
|
||||||
|
|
||||||
SCrawler::~SCrawler()
|
SCrawler::~SCrawler()
|
||||||
{
|
{
|
||||||
|
m_page->setNetworkAccessManager(nullptr);
|
||||||
|
|
||||||
|
delete m_pNAM;
|
||||||
|
delete m_page;
|
||||||
}
|
}
|
||||||
|
|
||||||
void SCrawler::load(QStringList _strlistArgv)
|
void SCrawler::load(QStringList _strlistArgv)
|
||||||
@@ -164,17 +171,18 @@ void SCrawler::load(QStringList _strlistArgv)
|
|||||||
url.setScheme("http");
|
url.setScheme("http");
|
||||||
|
|
||||||
m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);
|
m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);
|
||||||
QNetworkRequest *request = new QNetworkRequest;
|
QNetworkRequest request;
|
||||||
request->setUrl(url);
|
request.setUrl(url);
|
||||||
/*
|
/*
|
||||||
request->setRawHeader("Cache-Control","max-age=0, no-cache");
|
request.setRawHeader("Cache-Control","max-age=0, no-cache");
|
||||||
request->setRawHeader("Pragma","no-cache");
|
request.setRawHeader("Pragma","no-cache");
|
||||||
request->setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
|
request.setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
|
||||||
*/
|
*/
|
||||||
if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
|
if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
|
||||||
request->setRawHeader("Referer",m_strReper.toLocal8Bit());
|
request.setRawHeader("Referer",m_strReper.toLocal8Bit());
|
||||||
request->setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
|
request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
|
||||||
m_page->mainFrame()->load(*request);
|
m_page->mainFrame()->load(request);
|
||||||
|
|
||||||
m_bLast = false;
|
m_bLast = false;
|
||||||
m_bError = false;
|
m_bError = false;
|
||||||
}
|
}
|
||||||
@@ -198,6 +206,8 @@ void SCrawler::saveResult(bool ok)
|
|||||||
{
|
{
|
||||||
// qDebug() << "saveResult";
|
// qDebug() << "saveResult";
|
||||||
|
|
||||||
|
// cout << "page data: "<< m_page->bytesReceived() << endl;
|
||||||
|
|
||||||
if (!ok)
|
if (!ok)
|
||||||
{
|
{
|
||||||
cout << "Failed loading";
|
cout << "Failed loading";
|
||||||
@@ -209,14 +219,17 @@ void SCrawler::saveResult(bool ok)
|
|||||||
//qDebug() << "load complete";
|
//qDebug() << "load complete";
|
||||||
switch(m_nSelect)
|
switch(m_nSelect)
|
||||||
{
|
{
|
||||||
case E_NAVER_NEWS_LIST:saveFrameNewsList(m_page->mainFrame());break;
|
case E_NAVER_NEWS_LIST:
|
||||||
|
saveFrameNewsList(m_page->mainFrame());
|
||||||
|
break;
|
||||||
|
|
||||||
case E_NAVER_NEWS_DATA:
|
case E_NAVER_NEWS_DATA:
|
||||||
{
|
{
|
||||||
static bool loaded = false;
|
static bool loaded = false;
|
||||||
if(!loaded)
|
if(!loaded)
|
||||||
{
|
{
|
||||||
loaded = true;
|
loaded = true;
|
||||||
if(!saveFrameNewsUrl(m_page->mainFrame()))
|
if(saveFrameNewsUrl(m_page->mainFrame()) == false)
|
||||||
{
|
{
|
||||||
loaded = false;
|
loaded = false;
|
||||||
return;
|
return;
|
||||||
@@ -229,48 +242,57 @@ void SCrawler::saveResult(bool ok)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case E_NAVER_NEWS_REPLY:
|
case E_NAVER_NEWS_REPLY:
|
||||||
{
|
|
||||||
if(!saveFrameNewsComment(m_page->mainFrame()))
|
if(!saveFrameNewsComment(m_page->mainFrame()))
|
||||||
return;
|
return;
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break;
|
case E_NAVER_CAFE_LIST:
|
||||||
|
saveFrameCafeList(m_page->mainFrame());
|
||||||
|
break;
|
||||||
|
|
||||||
case E_NAVER_CAFE_DATA:
|
case E_NAVER_CAFE_DATA:
|
||||||
{
|
|
||||||
saveFrameCafeUrl(m_page->mainFrame());
|
saveFrameCafeUrl(m_page->mainFrame());
|
||||||
bodydata.sendDB();
|
bodydata.sendDB();
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
case E_NAVER_BLOG_LIST:
|
case E_NAVER_BLOG_LIST:
|
||||||
{
|
if(saveFrameList(m_page->mainFrame()) == false)
|
||||||
if(saveFrameList(m_page->mainFrame()))
|
return;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
else
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
case E_NAVER_BLOG_BODY:
|
case E_NAVER_BLOG_BODY:
|
||||||
{
|
if(saveFrameUrl(m_page->mainFrame()) == false)
|
||||||
if(!saveFrameUrl(m_page->mainFrame()))
|
|
||||||
return;
|
return;
|
||||||
|
|
||||||
bodydata.sendDB();
|
bodydata.sendDB();
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break;
|
case E_NAVER_BLOG_REPLY:
|
||||||
case E_DAUM_CAFE_LIST:saveFrameDaumCafeList(m_page->mainFrame());break;
|
saveFrameComment(m_page->mainFrame());
|
||||||
|
break;
|
||||||
|
|
||||||
|
case E_DAUM_CAFE_LIST:
|
||||||
|
saveFrameDaumCafeList(m_page->mainFrame());
|
||||||
|
break;
|
||||||
|
|
||||||
case E_DAUM_CAFE_DATA:
|
case E_DAUM_CAFE_DATA:
|
||||||
{
|
|
||||||
saveFrameDaumCafeUrl(m_page->mainFrame());
|
saveFrameDaumCafeUrl(m_page->mainFrame());
|
||||||
bodydata.sendDB();
|
bodydata.sendDB();
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
case E_DAUM_BLOG_LIST:saveFrameDaumBlogList(m_page->mainFrame());break;
|
case E_DAUM_BLOG_LIST:
|
||||||
|
saveFrameDaumBlogList(m_page->mainFrame());
|
||||||
|
break;
|
||||||
|
|
||||||
case E_DAUM_BLOG_BODY:
|
case E_DAUM_BLOG_BODY:
|
||||||
{
|
|
||||||
saveFrameDaumBlogUrl(m_page->mainFrame());
|
saveFrameDaumBlogUrl(m_page->mainFrame());
|
||||||
bodydata.sendDB();
|
bodydata.sendDB();
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
case E_DAUM_BLOG_REPLY:saveFrameDaumBlogComment(m_page->mainFrame());break;
|
case E_DAUM_BLOG_REPLY:
|
||||||
|
saveFrameDaumBlogComment(m_page->mainFrame());
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch(m_nSelect)
|
switch(m_nSelect)
|
||||||
@@ -296,8 +318,8 @@ void SCrawler::saveResult(bool ok)
|
|||||||
cout << "last";
|
cout << "last";
|
||||||
m_bLast = false;
|
m_bLast = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case E_NAVER_BLOG_REPLY:
|
case E_NAVER_BLOG_REPLY:
|
||||||
case E_NAVER_NEWS_REPLY:
|
case E_NAVER_NEWS_REPLY:
|
||||||
case E_DAUM_BLOG_REPLY:
|
case E_DAUM_BLOG_REPLY:
|
||||||
@@ -327,9 +349,9 @@ void SCrawler::saveResult(bool ok)
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
qDebug() << "finish";
|
|
||||||
emit finished();
|
|
||||||
|
|
||||||
|
qDebug() << " finish";
|
||||||
|
emit finished();
|
||||||
}
|
}
|
||||||
|
|
||||||
int SCrawler::GetNumber(QString _str)
|
int SCrawler::GetNumber(QString _str)
|
||||||
@@ -407,15 +429,13 @@ void SCrawler::reloadListPage()
|
|||||||
|
|
||||||
bool SCrawler::saveFrameList(QWebFrame *frame)
|
bool SCrawler::saveFrameList(QWebFrame *frame)
|
||||||
{
|
{
|
||||||
|
|
||||||
if (m_bProcessed == false)
|
if (m_bProcessed == false)
|
||||||
m_bProcessed = true;
|
m_bProcessed = true;
|
||||||
else
|
else
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
//qDebug() << frame->documentElement().toPlainText();
|
if (m_bUse == true)
|
||||||
|
return true;
|
||||||
if (m_bUse == true) return true;
|
|
||||||
QWebElement notFound = Find(frame->documentElement(),"div","id","notfound");
|
QWebElement notFound = Find(frame->documentElement(),"div","id","notfound");
|
||||||
if(notFound.isNull() == false)
|
if(notFound.isNull() == false)
|
||||||
{
|
{
|
||||||
@@ -577,7 +597,11 @@ bool SCrawler::saveFrameList(QWebFrame *frame)
|
|||||||
|
|
||||||
{
|
{
|
||||||
QWebElement total = Find(eleMain,"span","class","title_num");
|
QWebElement total = Find(eleMain,"span","class","title_num");
|
||||||
if (total.toPlainText().isEmpty()) {m_bError = true; return true;}
|
if (total.toPlainText().isEmpty())
|
||||||
|
{
|
||||||
|
m_bError = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
|
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
|
||||||
QStringList strList = m_strUrl.split("&");
|
QStringList strList = m_strUrl.split("&");
|
||||||
bool ok = false;
|
bool ok = false;
|
||||||
@@ -817,7 +841,7 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame)
|
|||||||
|
|
||||||
void SCrawler::reloadPage()
|
void SCrawler::reloadPage()
|
||||||
{
|
{
|
||||||
//qDebug() << "reloadPage called";
|
// qDebug() << "reloadPage called";
|
||||||
saveResult(true);
|
saveResult(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1059,7 +1083,11 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame)
|
|||||||
|
|
||||||
{
|
{
|
||||||
QWebElement total = Find(eleMain,"span","class","title_num");
|
QWebElement total = Find(eleMain,"span","class","title_num");
|
||||||
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
|
if (total.toPlainText().isEmpty())
|
||||||
|
{
|
||||||
|
m_bError = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
total.toPlainText().split("/").size();
|
total.toPlainText().split("/").size();
|
||||||
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
|
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
|
||||||
QStringList strList = m_strUrl.split("&");
|
QStringList strList = m_strUrl.split("&");
|
||||||
@@ -1377,7 +1405,11 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
|
|||||||
b_last = b_last | !(Find(frame->documentElement(), "div", "class", "result_message mg_cont").isNull());
|
b_last = b_last | !(Find(frame->documentElement(), "div", "class", "result_message mg_cont").isNull());
|
||||||
|
|
||||||
QWebElement total = Find(eleMain,"span","class","f_nb f_l");
|
QWebElement total = Find(eleMain,"span","class","f_nb f_l");
|
||||||
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
|
if (total.toPlainText().isEmpty())
|
||||||
|
{
|
||||||
|
m_bError = true;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
QString strTotal = total.toPlainText().split("/").at(1);
|
QString strTotal = total.toPlainText().split("/").at(1);
|
||||||
strTotal = strTotal.replace(",","");
|
strTotal = strTotal.replace(",","");
|
||||||
@@ -1635,47 +1667,66 @@ void SCrawler::saveFrameDaumBlogList(QWebFrame *frame){}
|
|||||||
|
|
||||||
void SCrawler::saveFrameNewsList(QWebFrame *frame)
|
void SCrawler::saveFrameNewsList(QWebFrame *frame)
|
||||||
{
|
{
|
||||||
if (m_bUse == true) return;
|
if (m_bUse == true)
|
||||||
|
return;
|
||||||
|
|
||||||
|
|
||||||
|
// QFile file("pagedata.txt");
|
||||||
|
// if ( file.open(QIODevice::ReadWrite) )
|
||||||
|
// {
|
||||||
|
// QTextStream stream( &file );
|
||||||
|
// stream << frame->documentElement().toOuterXml() << endl;
|
||||||
|
// file.close();
|
||||||
|
// }
|
||||||
|
|
||||||
QWebElement notFound = Find(frame->documentElement(),"div","class","no_content");
|
QWebElement notFound = Find(frame->documentElement(),"div","class","no_content");
|
||||||
if(notFound.isNull() == false)
|
if(notFound.isNull() == false)
|
||||||
{
|
{
|
||||||
m_bLast = true;
|
m_bLast = true;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
QWebElement eleMain = Find(frame->documentElement(),"div","class","srch_result_area headline");
|
QWebElement eleMain = Find(frame->documentElement(),"div","class","srch_result_area headline");
|
||||||
foreach(QWebElement eleSub,eleMain.findAll("div"))
|
foreach(QWebElement eleSub,eleMain.findAll("div"))
|
||||||
{
|
{
|
||||||
if (eleSub.attribute("class") == QString("info"))
|
if (eleSub.attribute("class") == QString("info"))
|
||||||
{
|
{
|
||||||
QString str = Find(eleSub,"a","class","go_naver").attribute("href");
|
QString str = Find(eleSub,"a","class","go_naver").attribute("href");
|
||||||
if (str.trimmed().isEmpty()) continue;
|
if (str.trimmed().isEmpty())
|
||||||
if (str.contains("http://sports")) continue;
|
continue;
|
||||||
|
if (str.contains("http://sports"))
|
||||||
|
continue;
|
||||||
m_bNothing = true;
|
m_bNothing = true;
|
||||||
cout << "o " << str.toStdString() << endl;
|
cout << "o " << str.toStdString() << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
QStringList strTotal = bodydata.GetNumber(Find(frame->documentElement(),"span","class","result_num").toPlainText().trimmed());
|
// QStringList strTotal = bodydata.GetNumber(Find(frame->documentElement(),"span","class","result_num").toPlainText().trimmed());
|
||||||
QVector <int> vecTotal;
|
QWebElement Total = Find(frame->documentElement(), "div", "class", "title_desc");
|
||||||
foreach(QString str,strTotal)
|
QStringList nums = bodydata.GetNumber(Total.toPlainText());
|
||||||
|
|
||||||
|
if(nums.count() < 3)
|
||||||
{
|
{
|
||||||
if (str.trimmed().isEmpty() == false)
|
m_bError = true;
|
||||||
vecTotal.push_back(str.toInt());
|
m_bUse = true;
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (vecTotal.size() == 3)
|
QVector <int> vecTotal;
|
||||||
{
|
vecTotal.push_back(nums[0].toInt());
|
||||||
if (vecTotal[0] >= vecTotal[1]) m_bLast = true;
|
vecTotal.push_back(nums[1].toInt());
|
||||||
if (vecTotal[1] == vecTotal[2]) m_bLast = true;
|
vecTotal.push_back(nums[2].toInt());
|
||||||
}
|
|
||||||
else
|
if (vecTotal[0] >= vecTotal[1] || vecTotal[1] == vecTotal[2])
|
||||||
m_bError = true;
|
m_bLast = true;
|
||||||
|
|
||||||
m_bUse = true;
|
m_bUse = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool SCrawler::saveFrameNewsUrl(QWebFrame *frame)
|
bool SCrawler::saveFrameNewsUrl(QWebFrame *frame)
|
||||||
{
|
{
|
||||||
if (m_bUse) return true;
|
if (m_bUse)
|
||||||
|
return true;
|
||||||
|
|
||||||
{
|
{
|
||||||
QString strQuery = "delete from ";
|
QString strQuery = "delete from ";
|
||||||
@@ -2388,19 +2439,25 @@ bool SCrawler::setProxyFromFile()
|
|||||||
switch(strList.size())
|
switch(strList.size())
|
||||||
{
|
{
|
||||||
case 1:
|
case 1:
|
||||||
|
{
|
||||||
cout << "p : " << strList.at(0).toStdString() << " from File" << endl;
|
cout << "p : " << strList.at(0).toStdString() << " from File" << endl;
|
||||||
|
|
||||||
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
||||||
//m_page->setNetworkAccessManager(manager);
|
//m_page->setNetworkAccessManager(manager);
|
||||||
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
QNetworkProxy proxy(QNetworkProxy::HttpProxy, strList.at(0));
|
||||||
|
QNetworkProxy::setApplicationProxy(proxy);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
|
{
|
||||||
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl;
|
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl;
|
||||||
|
|
||||||
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
||||||
//m_page->setNetworkAccessManager(manager);
|
//m_page->setNetworkAccessManager(manager);
|
||||||
|
|
||||||
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
QNetworkProxy proxy(QNetworkProxy::HttpProxy, strList.at(0), strList.at(1).toInt());
|
||||||
|
QNetworkProxy::setApplicationProxy(proxy);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2439,11 +2496,12 @@ bool SCrawler::setProxyFromDb()
|
|||||||
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2:
|
||||||
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from DB" << endl;
|
|
||||||
m_strProxyIP = strList.at(0);
|
m_strProxyIP = strList.at(0);
|
||||||
m_nProxyPort = strList.at(1).toInt();
|
m_nProxyPort = strList.at(1).toInt();
|
||||||
|
|
||||||
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
cout << "p : " << m_strProxyIP.toStdString() << ":" << m_nProxyPort << " from DB" << endl;
|
||||||
|
|
||||||
|
//QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy, m_strProxyIP, m_nProxyPort)));
|
||||||
/*
|
/*
|
||||||
QString strProxyHost = "61.103.7.74";
|
QString strProxyHost = "61.103.7.74";
|
||||||
int nPort = 2074;
|
int nPort = 2074;
|
||||||
@@ -2468,7 +2526,6 @@ bool SCrawler::setProxyFromDb()
|
|||||||
void SCrawler::setProxy()
|
void SCrawler::setProxy()
|
||||||
{
|
{
|
||||||
bool ok = setProxyFromFile() || setProxyFromDb();
|
bool ok = setProxyFromFile() || setProxyFromDb();
|
||||||
//bool ok = false;
|
|
||||||
if (!ok)
|
if (!ok)
|
||||||
cout << "No Proxy" << endl;
|
cout << "No Proxy" << endl;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
class SCrawler : public QObject
|
class SCrawler : public QObject
|
||||||
{
|
{
|
||||||
Q_OBJECT
|
Q_OBJECT
|
||||||
|
|
||||||
public:
|
public:
|
||||||
enum E_SELECT
|
enum E_SELECT
|
||||||
{
|
{
|
||||||
@@ -25,7 +26,7 @@ public:
|
|||||||
};
|
};
|
||||||
public:
|
public:
|
||||||
SCrawler();
|
SCrawler();
|
||||||
~SCrawler();
|
virtual ~SCrawler();
|
||||||
void load(QStringList _strlistArgv);
|
void load(QStringList _strlistArgv);
|
||||||
void saveFile();
|
void saveFile();
|
||||||
static void Debug(QString _strFilename,QString _strData);
|
static void Debug(QString _strFilename,QString _strData);
|
||||||
@@ -35,6 +36,7 @@ private slots:
|
|||||||
void saveResult(bool ok);
|
void saveResult(bool ok);
|
||||||
void reloadPage();
|
void reloadPage();
|
||||||
void reloadListPage();
|
void reloadListPage();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int m_nSelect;
|
int m_nSelect;
|
||||||
QString m_strReper;
|
QString m_strReper;
|
||||||
@@ -43,6 +45,7 @@ private:
|
|||||||
SCrawlerData bodydata;
|
SCrawlerData bodydata;
|
||||||
|
|
||||||
QWebPage *m_page;
|
QWebPage *m_page;
|
||||||
|
QNetworkAccessManager* m_pNAM;
|
||||||
QString m_strFile;
|
QString m_strFile;
|
||||||
QString m_strUrl;
|
QString m_strUrl;
|
||||||
QString m_strTable;
|
QString m_strTable;
|
||||||
|
|||||||
@@ -136,6 +136,9 @@ QStringList SCrawlerData::GetNumber(QString _str)
|
|||||||
{
|
{
|
||||||
if (pch[i].isNumber() || pch[i].isSpace())
|
if (pch[i].isNumber() || pch[i].isSpace())
|
||||||
str += pch[i];
|
str += pch[i];
|
||||||
|
else if(pch[i] != ',' && pch[i] != '.')
|
||||||
|
str += ' ';
|
||||||
}
|
}
|
||||||
return str.trimmed().split(" ");
|
|
||||||
|
return str.trimmed().split(" ", QString::SkipEmptyParts);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -25,9 +25,13 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|||||||
|
|
||||||
|
|
||||||
def is_debugger_attached():
|
def is_debugger_attached():
|
||||||
|
try:
|
||||||
for frame in inspect.stack():
|
for frame in inspect.stack():
|
||||||
if frame[1].endswith("pydevd.py"):
|
if frame[1].endswith("pydevd.py"):
|
||||||
return True
|
return True
|
||||||
|
except:
|
||||||
|
return False
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
is_debug = is_debugger_attached()
|
is_debug = is_debugger_attached()
|
||||||
|
|||||||
@@ -103,32 +103,27 @@ class Proxy2Handler:
|
|||||||
|
|
||||||
def lock_enter(self):
|
def lock_enter(self):
|
||||||
# logger.log('lock {}'.format(threading.current_thread().ident))
|
# logger.log('lock {}'.format(threading.current_thread().ident))
|
||||||
# self.lock.acquire()
|
self.lock.acquire()
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def lock_leave(self):
|
def lock_leave(self):
|
||||||
# self.lock.release()
|
self.lock.release()
|
||||||
# logger.log('unlock {}'.format(threading.current_thread().ident))
|
# logger.log('unlock {}'.format(threading.current_thread().ident))
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def commit(self):
|
def commit(self):
|
||||||
self.lock_enter()
|
|
||||||
# self.session.commit()
|
# self.session.commit()
|
||||||
self.lock_leave()
|
pass
|
||||||
|
|
||||||
def get_oldest(self, platform):
|
def get_oldest(self, platform):
|
||||||
self.lock_enter()
|
|
||||||
instance = self.session.query(Proxy2Model).order_by(self.block_field_map[platform].desc()).first()
|
instance = self.session.query(Proxy2Model).order_by(self.block_field_map[platform].desc()).first()
|
||||||
self.lock_leave()
|
|
||||||
return instance
|
return instance
|
||||||
|
|
||||||
def get_query(self, ip, port):
|
def get_query(self, ip, port):
|
||||||
return self.session.query(Proxy2Model).filter_by(ip=ip).filter_by(port=port)
|
return self.session.query(Proxy2Model).filter_by(ip=ip).filter_by(port=port)
|
||||||
|
|
||||||
def get_instance(self, ip, port):
|
def get_instance(self, ip, port):
|
||||||
self.lock_enter()
|
|
||||||
instance = self.get_query(ip, port).first()
|
instance = self.get_query(ip, port).first()
|
||||||
self.lock_leave()
|
|
||||||
return instance
|
return instance
|
||||||
|
|
||||||
def check_all_proxies(self, platform):
|
def check_all_proxies(self, platform):
|
||||||
@@ -161,7 +156,7 @@ class Proxy2Handler:
|
|||||||
if resp.ok:
|
if resp.ok:
|
||||||
instance.set_block_at(platform, None)
|
instance.set_block_at(platform, None)
|
||||||
alive_cnt += 1
|
alive_cnt += 1
|
||||||
print('proxy {}:{} alive'.format(instance.ip, instance.port))
|
# print('proxy {}:{} alive'.format(instance.ip, instance.port))
|
||||||
else:
|
else:
|
||||||
instance.set_block_at(platform, datetime.datetime.now())
|
instance.set_block_at(platform, datetime.datetime.now())
|
||||||
|
|
||||||
@@ -171,34 +166,12 @@ class Proxy2Handler:
|
|||||||
def get(self, platform, proc_id=-1):
|
def get(self, platform, proc_id=-1):
|
||||||
self.lock_enter()
|
self.lock_enter()
|
||||||
|
|
||||||
try:
|
|
||||||
|
|
||||||
block_column = self.block_field_map[platform]
|
block_column = self.block_field_map[platform]
|
||||||
try:
|
|
||||||
instances = self.session.query(Proxy2Model).filter(block_column == None).limit(32).all()
|
instances = self.session.query(Proxy2Model).filter(block_column == None).limit(32).all()
|
||||||
except Exception as e:
|
|
||||||
dbg.print_exception()
|
|
||||||
assert True
|
|
||||||
|
|
||||||
self.lock_leave()
|
|
||||||
|
|
||||||
# try:
|
|
||||||
# session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine)
|
|
||||||
# self.session = sqlalchemy.orm.scoped_session(session_factory)
|
|
||||||
# logger.log('{} session recreate'.format(proc_id))
|
|
||||||
#
|
|
||||||
# except Exception as e2:
|
|
||||||
# dbg.print_exception(e2)
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
instance = None
|
|
||||||
if len(instances) > 0:
|
|
||||||
instance = instances[random.randint(0, len(instances)-1)] if len(instances) > 0 else None
|
instance = instances[random.randint(0, len(instances)-1)] if len(instances) > 0 else None
|
||||||
|
proxy = None
|
||||||
if instance:
|
if instance:
|
||||||
self.lock_leave()
|
proxy = instance.get_instance_for_http()
|
||||||
return instance.get_instance_for_http()
|
|
||||||
else:
|
else:
|
||||||
cnt = self.check_all_proxies(platform)
|
cnt = self.check_all_proxies(platform)
|
||||||
if cnt <= 0:
|
if cnt <= 0:
|
||||||
@@ -206,47 +179,27 @@ class Proxy2Handler:
|
|||||||
self.insert_all(proxies)
|
self.insert_all(proxies)
|
||||||
|
|
||||||
self.lock_leave()
|
self.lock_leave()
|
||||||
return self.get(platform, proc_id)
|
return proxy
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
dbg.print_exception(e)
|
|
||||||
|
|
||||||
def insert(self, ip, port):
|
def insert(self, ip, port):
|
||||||
instance = self.get_instance(ip, port)
|
instance = self.get_instance(ip, port)
|
||||||
if not instance:
|
if not instance:
|
||||||
proxy = Proxy2Model(ip, port)
|
proxy = Proxy2Model(ip, port)
|
||||||
self.lock_enter()
|
|
||||||
self.session.add(proxy)
|
self.session.add(proxy)
|
||||||
self.lock_leave()
|
self.commit()
|
||||||
|
|
||||||
def insert_all(self, proxies):
|
def insert_all(self, proxies):
|
||||||
|
print('{} proxy insert start'.format(len(proxies)))
|
||||||
# INSERT INTO proxy2(ip, PORT)
|
# INSERT INTO proxy2(ip, PORT)
|
||||||
# SELECT <ip>, <port> FROM DUAL
|
# SELECT <ip>, <port> FROM DUAL
|
||||||
# WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip=<ip> AND PORT=<port>)
|
# WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip=<ip> AND PORT=<port>)
|
||||||
self.lock.acquire()
|
|
||||||
for proxy in proxies:
|
for proxy in proxies:
|
||||||
query = r"INSERT INTO proxy2(ip, PORT) " \
|
query = r"INSERT INTO proxy2(ip, PORT) " \
|
||||||
r"SELECT '{}', {} FROM DUAL " \
|
r"SELECT '{}', {} FROM DUAL " \
|
||||||
r"WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip='{}' AND PORT={})"\
|
r"WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip='{}' AND PORT={})"\
|
||||||
.format(proxy['ip'], proxy['port'], proxy['ip'], proxy['port'])
|
.format(proxy['ip'], proxy['port'], proxy['ip'], proxy['port'])
|
||||||
# 안됨 - 중복으로 들어감, 쓰레드 종료됨
|
|
||||||
self.engine.execute(query)
|
self.engine.execute(query)
|
||||||
self.lock.release()
|
print('{} proxy insert end'.format(len(proxies)))
|
||||||
|
|
||||||
# self.query(Proxy2Model).insert()
|
|
||||||
#
|
|
||||||
# self.query(Proxy2Model).filter(Proxy2Model.ip == proxy['ip']).filter(Proxy2Model.port == proxy['port']).\
|
|
||||||
# filter(
|
|
||||||
# ~sqlalchemy.exists().where(
|
|
||||||
# sqlalchemy.and_(
|
|
||||||
# Proxy2Model.kw_id == Proxy2Model.kw_id,
|
|
||||||
# Proxy2Model.checkpoint_id == Proxy2Model.id
|
|
||||||
# )
|
|
||||||
# )
|
|
||||||
# )
|
|
||||||
#
|
|
||||||
# if self.session.query(Proxy2Model).filter_by(ip=proxy['ip']).filter_by(port=proxy['port']).count() == 0:
|
|
||||||
# self.session.add(Proxy2Model(proxy['ip'], proxy['port']))
|
|
||||||
|
|
||||||
def set_proxy_blocked(self, ip, port, platform):
|
def set_proxy_blocked(self, ip, port, platform):
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -86,13 +86,14 @@ def check_proxy(qu, proxy, url):
|
|||||||
|
|
||||||
|
|
||||||
def crawl_proxies(check_url=None):
|
def crawl_proxies(check_url=None):
|
||||||
# print('proxy crawling start')
|
print('proxy crawling start')
|
||||||
proxies = get_proxies_free_proxy()
|
proxies = []
|
||||||
|
proxies += get_proxies_free_proxy()
|
||||||
proxies += get_proxies_proxy_searcher()
|
proxies += get_proxies_proxy_searcher()
|
||||||
# proxies += get_proxies_nntime()
|
# proxies += get_proxies_nntime()
|
||||||
# proxies = list(set(proxies))
|
# proxies = list(set(proxies))
|
||||||
# print('proxy crawled {}'.format(len(proxies)))
|
|
||||||
|
|
||||||
|
proxies_alive = []
|
||||||
if check_url:
|
if check_url:
|
||||||
qu = queue.Queue()
|
qu = queue.Queue()
|
||||||
threads = []
|
threads = []
|
||||||
@@ -103,7 +104,6 @@ def crawl_proxies(check_url=None):
|
|||||||
[th.start() for th in threads]
|
[th.start() for th in threads]
|
||||||
[th.join() for th in threads]
|
[th.join() for th in threads]
|
||||||
|
|
||||||
proxies_alive = []
|
|
||||||
while not qu.empty():
|
while not qu.empty():
|
||||||
proxy = qu.get()
|
proxy = qu.get()
|
||||||
proxies_alive.append(proxy)
|
proxies_alive.append(proxy)
|
||||||
@@ -111,21 +111,9 @@ def crawl_proxies(check_url=None):
|
|||||||
else:
|
else:
|
||||||
proxies_alive = proxies
|
proxies_alive = proxies
|
||||||
|
|
||||||
# print('proxy crawling end')
|
print('proxy crawled {}'.format(len(proxies_alive)))
|
||||||
return proxies_alive
|
return proxies_alive
|
||||||
|
|
||||||
# proxies_alive.sort()
|
|
||||||
# print('proxy crawler got {} proxies'.format(len(proxies_alive)))
|
|
||||||
#
|
|
||||||
# with open('proxy.txt', 'w') as f:
|
|
||||||
# print('proxy crawler dump start')
|
|
||||||
# for proxy in proxies_alive:
|
|
||||||
# # print(proxy)
|
|
||||||
# f.write(proxy + '\n')
|
|
||||||
# print('proxy crawler dump end')
|
|
||||||
#
|
|
||||||
# print('proxy crawling end')
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
|||||||
@@ -236,7 +236,7 @@ def make_list_instance(url, proxies=None):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
# @instance_wrapper
|
@instance_wrapper
|
||||||
def make_content_instance(url, proxies=None):
|
def make_content_instance(url, proxies=None):
|
||||||
try:
|
try:
|
||||||
content = InstaContent(url, {}, url, proxies)
|
content = InstaContent(url, {}, url, proxies)
|
||||||
@@ -265,7 +265,7 @@ def ajax_wrapper(func):
|
|||||||
return retry_ajax_load
|
return retry_ajax_load
|
||||||
|
|
||||||
|
|
||||||
# @ajax_wrapper
|
@ajax_wrapper
|
||||||
def load_ajax_list(ins):
|
def load_ajax_list(ins):
|
||||||
try:
|
try:
|
||||||
insta_list = ins.load_more()
|
insta_list = ins.load_more()
|
||||||
@@ -280,7 +280,7 @@ def load_ajax_list(ins):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
# @ajax_wrapper
|
@ajax_wrapper
|
||||||
def load_ajax_reply(ins):
|
def load_ajax_reply(ins):
|
||||||
try:
|
try:
|
||||||
replies = ins.load_reply_more()
|
replies = ins.load_reply_more()
|
||||||
@@ -978,8 +978,9 @@ class InstaAlgorithmMulti(InstaAlgorithm):
|
|||||||
self.total_num += 1
|
self.total_num += 1
|
||||||
if self.is_until_page():
|
if self.is_until_page():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# if self.list_crawl:
|
# if self.list_crawl:
|
||||||
# printl("Number of Lists = {0}".format(len(self.list_crawl)))
|
# printl("Number of Lists = {0}".format(self.list_crawl.qsize()))
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def crawl(self):
|
def crawl(self):
|
||||||
|
|||||||
@@ -227,7 +227,7 @@ class TwitterCrawler:
|
|||||||
for container_tags in reply_container_tags:
|
for container_tags in reply_container_tags:
|
||||||
tweet_tags = container_tags.select('div.tweet')
|
tweet_tags = container_tags.select('div.tweet')
|
||||||
if len(tweet_tags) > 0:
|
if len(tweet_tags) > 0:
|
||||||
tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw)
|
tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, parent_tw, top_tw)
|
||||||
# print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link))
|
# print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link))
|
||||||
print('[{}] reply {} [{}]'.format(proc_id, tweet.top_link, 'ok'))
|
print('[{}] reply {} [{}]'.format(proc_id, tweet.top_link, 'ok'))
|
||||||
self.insert_content_pool(proc_id, content_queue, tweet, top_tw)
|
self.insert_content_pool(proc_id, content_queue, tweet, top_tw)
|
||||||
|
|||||||
@@ -5,10 +5,11 @@ import bs4
|
|||||||
import datetime
|
import datetime
|
||||||
import pytz
|
import pytz
|
||||||
|
|
||||||
|
|
||||||
class TweetParser:
|
class TweetParser:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse(tag, keyword_id, depth=0, top_tw: Tweet=None):
|
def parse(tag, keyword_id, depth=0, parent_tw: Tweet=None, top_tw: Tweet=None):
|
||||||
tweet = Tweet()
|
tweet = Tweet()
|
||||||
|
|
||||||
tweet.tweet_id = int(tag.attrs['data-tweet-id'])
|
tweet.tweet_id = int(tag.attrs['data-tweet-id'])
|
||||||
@@ -62,7 +63,7 @@ class TweetParser:
|
|||||||
tweet.platform_form = 'post'
|
tweet.platform_form = 'post'
|
||||||
tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id
|
tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id
|
||||||
tweet.article_form = 'body' if tweet.depth is 0 else 'reply'
|
tweet.article_form = 'body' if tweet.depth is 0 else 'reply'
|
||||||
# tweet.article_parent = None
|
tweet.article_parent = parent_tw.user_name if parent_tw else None
|
||||||
tweet.article_id = tweet.user_id
|
tweet.article_id = tweet.user_id
|
||||||
tweet.article_nickname = tweet.user_name
|
tweet.article_nickname = tweet.user_name
|
||||||
# tweet.article_title = None
|
# tweet.article_title = None
|
||||||
|
|||||||
@@ -86,6 +86,8 @@ if __name__ == '__main__':
|
|||||||
sys.argv[5] until_page
|
sys.argv[5] until_page
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
print("arguments: {}".format(' '.join(sys.argv)))
|
||||||
|
|
||||||
if len(sys.argv) == 6:
|
if len(sys.argv) == 6:
|
||||||
print_and_flush("Python Crawling Executed")
|
print_and_flush("Python Crawling Executed")
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user