From 17eb8b75caf530ba78a806ebada4e5226432833e Mon Sep 17 00:00:00 2001 From: admin Date: Thu, 9 Jul 2015 09:30:40 +0000 Subject: [PATCH] =?UTF-8?q?=ED=94=84=EB=A1=9D=EC=8B=9C=20=EC=82=AC?= =?UTF-8?q?=EC=9D=B4=ED=8A=B8=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git-svn-id: svn://192.168.0.12/source@165 8346c931-da38-4b9b-9d4c-e48b93cbd075 --- ProxyProcess/ProxyProcess.pro | 2 +- ProxyProcess/ProxyProcess.pro.user | 4 +- ProxyProcess/scrawler.cpp | 503 ++++++++++++++++++++++++----- ProxyProcess/scrawler.h | 13 + 4 files changed, 432 insertions(+), 90 deletions(-) diff --git a/ProxyProcess/ProxyProcess.pro b/ProxyProcess/ProxyProcess.pro index 4daeffb..a540323 100644 --- a/ProxyProcess/ProxyProcess.pro +++ b/ProxyProcess/ProxyProcess.pro @@ -12,7 +12,7 @@ CONFIG -= app_bundle TEMPLATE = app SOURCES += main.cpp \ - scrawler.cpp \ + scrawler.cpp HEADERS += \ diff --git a/ProxyProcess/ProxyProcess.pro.user b/ProxyProcess/ProxyProcess.pro.user index 8753660..75527d4 100644 --- a/ProxyProcess/ProxyProcess.pro.user +++ b/ProxyProcess/ProxyProcess.pro.user @@ -1,6 +1,6 @@ - + EnvironmentId @@ -227,7 +227,7 @@ ProxyProcess Qt4ProjectManager.Qt4RunConfiguration:C:/source/ProxyProcess/ProxyProcess.pro - "http://free-proxy.cz/en/proxylist/country/all/http/speed/level2" + "http://www.gatherproxy.com/proxylist/anonymity/?t=Elite3" ProxyProcess.pro false true diff --git a/ProxyProcess/scrawler.cpp b/ProxyProcess/scrawler.cpp index aaa8ef8..ff65c0a 100644 --- a/ProxyProcess/scrawler.cpp +++ b/ProxyProcess/scrawler.cpp @@ -4,7 +4,8 @@ #include #include #include - +#include +#include using namespace std; struct SProxyList @@ -19,8 +20,9 @@ SCrawler::SCrawler():QObject() connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool))); // p_timer = new QTimer(this); // connect(p_timer,SIGNAL(timeout()), this, SLOT(killProcess())); - QTimer::singleShot(90000, this, SLOT(killProcess())); + QTimer::singleShot(60000, this, SLOT(killProcess())); m_bCrawled = false; + } SCrawler::~SCrawler() @@ -61,9 +63,6 @@ void SCrawler::saveResult(bool ok) return; } - Debug("c:/data/test3.html", m_page->currentFrame()->toHtml()); - - QString strIpList; if(m_strUrl.contains("hidemyass")) { @@ -96,7 +95,7 @@ void SCrawler::saveResult(bool ok) strIpList = addSource(strIpList, "cybersyndrome.net"); } } - else if(m_strUrl.contains("proxylists")) + else if(m_strUrl.contains("proxylists.net")) { QWebElement p_parse = m_page->mainFrame()->findFirstElement("table"); @@ -107,34 +106,125 @@ void SCrawler::saveResult(bool ok) strIpList = addSource(strIpList, "proxylists.net"); } } + else if(m_strUrl.contains("txt.proxyspy.net")) + { + QString p_parse = m_page->mainFrame()->toPlainText(); + if(!p_parse.isEmpty()) + { + strIpList = getIpListFromProxySpy(p_parse); + if(strIpList.trimmed().size() > 0 ) + strIpList = addSource(strIpList, "proxyspy.net"); + } + /* + if(!p_parse.isNull()) + { + strIpList = getIpListFromProxylists(p_parse); + if(strIpList.trimmed().size() > 0 ) + strIpList = addSource(strIpList, "proxylists.net"); + } + */ + } + else if(m_strUrl.contains("proxysearcher.sourceforge.net")) + { + QWebElement p_parse = m_page->mainFrame()->findFirstElement("body"); + if(!p_parse.isNull()) + { + strIpList = getIpListFromProxySearcher(p_parse); + if(strIpList.trimmed().size() > 0 ) + strIpList = addSource(strIpList, "proxysearcher.sourceforge.net"); + } + } + else if(m_strUrl.contains("proxylist.ro")) + { + QWebElement p_parse = m_page->mainFrame()->findFirstElement("body"); + if(!p_parse.isNull()) + { + strIpList = getIpListFromProxyListro(p_parse); + if(strIpList.trimmed().size() > 0 ) + strIpList = addSource(strIpList, "proxylist.ro"); + } + } + + else if(m_strUrl.contains("samair.ru")) + { + QWebElement p_parse = Find(m_page->currentFrame()->documentElement(), "div", "id", "content"); + if(!p_parse.isNull()) + { + strIpList = getIpListFromSamuir(p_parse); + if(strIpList.trimmed().size() > 0 ) + strIpList = addSource(strIpList, "samair.ru"); + } + } + + else if(m_strUrl.contains("nntime.com")) + { + QWebElement p_parse = Find(m_page->currentFrame()->documentElement(), "table", "id", "proxylist"); + if(!p_parse.isNull()) + { + strIpList = getIpListFromNntime(p_parse); + if(strIpList.trimmed().size() > 0 ) + strIpList = addSource(strIpList, "nntime.com"); + } + } + else if(m_strUrl.contains("free-proxy.cz")) + { + QWebElement p_parse = Find(m_page->currentFrame()->documentElement(), "table", "id", "proxy_list"); + if(!p_parse.isNull()) + { + strIpList = getIpListFromFreeproxy(p_parse); + if(strIpList.trimmed().size() > 0 ) + strIpList = addSource(strIpList, "free-proxy.cz"); + } + } + else if(m_strUrl.contains("xroxy.com")) + { + QWebElement p_parse = m_page->currentFrame()->documentElement().findFirst("body"); + if(!p_parse.isNull()) + { + strIpList = getIpListFromXroxy(p_parse); + if(strIpList.trimmed().size() > 0 ) + strIpList = addSource(strIpList, "xroxy.com"); + } + } + else if(m_strUrl.contains("cool-proxy.net")) + { + QWebElement p_parse = Find(m_page->currentFrame()->documentElement(), "div", "id", "main"); + if(!p_parse.isNull()) + { + strIpList = getIpListFromCoolProxy(p_parse); + if(strIpList.trimmed().size() > 0 ) + strIpList = addSource(strIpList, "cool-proxy.net"); + } + } + else if(m_strUrl.contains("gatherproxy.com")) + { + strIpList = getIpListFromGatherProxy(); + if(strIpList.trimmed().size() > 0 ) + strIpList = addSource(strIpList, "gatherproxy.com"); + } + + else if(m_strUrl.contains("wait3")) + { + QThread::sleep(3); + } + else if(m_strUrl.contains("wait5")) + { + QThread::sleep(5); + } + + + QThread::sleep(5); + //qDebug() << strIpList; + //Debug("c:/data/test3.html", m_page->mainFrame()->toHtml()); if(strIpList.trimmed().size() > 0) cout << strIpList.trimmed().toStdString(); - // success to crawling + if(strIpList.size() > 8) { - // in case sending iplist to db - if(m_strLocation.compare("local") != 0) - { - // success to send ip list to db - if(SendIpList(strIpList)) - { - cout << endl << "uok"; - } - // fail to sen ip list to db - else - { - cout << endl << "fok"; - } - } - // in case not sending iplist to db - else - { - cout << endl << "ok"; - } + cout << endl << "ok"; } - // fail to crawling else { cout << "sitedown"; @@ -408,7 +498,284 @@ QString SCrawler::getIpListFromProxylists(const QWebElement _FindElement) return totalResult.trimmed(); } -/* +QString SCrawler::getIpListFromProxySpy(const QString _txt) +{ + QStringList strlist = _txt.split("\n", QString::SkipEmptyParts); + QString totalResult; + //qDebug() << _txt; + QRegExp re("(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}):(\\d{2,5})\\s*[a-zA-Z]+-(N|A|H)\\S*"); + foreach(QString str, strlist) + { + int pos = 0; + while((pos = re.indexIn(str, pos)) != -1) + { + if((re.cap(3) == "A") || (re.cap(3)) == "H") + totalResult += (re.cap(1) + "," + re.cap(2) + "\n"); + pos += re.matchedLength(); + } + } + return totalResult.trimmed(); +} + +QString SCrawler::getIpListFromProxySearcher(const QWebElement _FindElement) +{ + QString totalResult; + QRegExp re("(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}):(\\d{2,5})"); + QString str = _FindElement.toPlainText(); + int pos = 0; + while((pos = re.indexIn(str, pos)) != -1) + { + totalResult += (re.cap(1) + "," + re.cap(2) + "\n"); + pos += re.matchedLength(); + } + return totalResult.trimmed(); +} + +QString SCrawler::getIpListFromProxyListro(const QWebElement _FindElement) +{ + QString totalResult; + + QWebElementCollection trs = _FindElement.findAll("tr"); + + foreach(QWebElement tr, trs) + { + QString strclass = tr.attribute("class").trimmed(); + if((strclass.compare("speed1") == 0) || (strclass.compare("speed2") == 0)) + { + QWebElementCollection tds = tr.findAll("td"); + if(tds.count() < 4) + continue; + if((tds.at(3).toPlainText().trimmed() == "Y") || (tds.at(3).toPlainText().trimmed() == "y")) + { + totalResult += tds.at(1).toPlainText().trimmed().replace("\"","").trimmed() + "," + tds.at(2).toPlainText().trimmed().replace("\"","").trimmed() + "\n"; + } + } + } + + return totalResult.trimmed(); +} + +QString SCrawler::getIpListFromSamuir(const QWebElement _FindElement) +{ + QString totalResult; + + QWebElement table = Find(_FindElement, "table", "id", "proxylist"); + + QWebElementCollection trs = table.findAll("tr"); + + foreach(QWebElement tr, trs) + { + QWebElementCollection tds = tr.findAll("td"); + if(tds.count() < 3) + continue; + + if(tds.at(1).toPlainText().contains("anony")) + { + QString temp = tds.at(0).toPlainText().replace("\"","").trimmed(); + totalResult += (temp.replace(":",",") + "\n"); + } + } + + return totalResult.trimmed(); +} + +QString SCrawler::getIpListFromNntime(const QWebElement _FindElement) +{ + QString totalResult; + QWebElementCollection trs = _FindElement.findAll("tr"); + + QRegExp re("(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}):(\\d{2,5})"); + + foreach(QWebElement tr, trs) + { + QWebElementCollection tds = tr.findAll("td"); + if(tds.count() < 4) + continue; + if(tds.at(2).toPlainText().contains("anony")) + { + int pos = 0; + while((pos = re.indexIn(tds.at(1).toPlainText().replace("\"",""), pos)) != -1) + { + totalResult += (re.cap(1) + "," + re.cap(2) + "\n"); + pos += re.matchedLength(); + } + } + } + + return totalResult.trimmed(); +} + +QString SCrawler::getIpListFromFreeproxy(const QWebElement _FindElement) +{ + QString totalResult; + QWebElementCollection trs = _FindElement.findAll("tr"); + + foreach(QWebElement tr, trs) + { + QWebElementCollection tds = tr.findAll("td"); + if(tds.count() < 7) + continue; + + totalResult += (tds.at(0).toPlainText().trimmed() + "," + tds.at(1).toPlainText().trimmed() + "\n"); + } + + return totalResult.trimmed(); +} + +QString SCrawler::getIpListFromXroxy(const QWebElement _FindElement) +{ + QString totalResult; + QWebElementCollection trs = _FindElement.findAll("tr"); + QRegExp reip("(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})"); + QRegExp repo("(\\d{2,5})"); + foreach(QWebElement tr, trs) + { + QWebElementCollection tds = tr.findAll("td"); + QString ip; + QString port; + if(tds.count() < 7) + continue; + { + int pos = 0; + while((pos = reip.indexIn(tds.at(1).toPlainText().replace("\"","").trimmed(), pos)) != -1) + { + ip = reip.cap(1); + pos += reip.matchedLength(); + } + } + { + int pos = 0; + while((pos = repo.indexIn(tds.at(2).toPlainText().replace("\"","").trimmed(), pos)) != -1) + { + port = repo.cap(1); + pos += repo.matchedLength(); + } + } + if(!ip.isEmpty() && !port.isEmpty()) + { + totalResult += (ip + "," + port + "\n"); + } + + { + ip.clear(); + port.clear(); + } + + } + return totalResult.trimmed(); +} + + +QString SCrawler::getIpListFromCoolProxy(const QWebElement _FindElement) +{ + QString totalResult; + QWebElementCollection trs = _FindElement.findAll("tr"); + + QRegExp reip("(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})"); + QRegExp repo("(\\d{2,5})"); + + + foreach(QWebElement tr, trs) + { + QWebElementCollection tds = tr.findAll("td"); + QString ip; + QString port; + if(tds.count() < 7) + continue; + { + int pos = 0; + while((pos = reip.indexIn(tds.at(0).toPlainText().replace("\"","").trimmed(), pos)) != -1) + { + ip = reip.cap(1); + pos += reip.matchedLength(); + } + } + { + int pos = 0; + while((pos = repo.indexIn(tds.at(1).toPlainText().replace("\"","").trimmed(), pos)) != -1) + { + port = repo.cap(1); + pos += repo.matchedLength(); + } + } + if(!ip.isEmpty() && !port.isEmpty()) + { + totalResult += (ip + "," + port + "\n"); + } + + { + ip.clear(); + port.clear(); + } + } + + return totalResult.trimmed(); +} + + + +QString SCrawler::getIpListFromGatherProxy() +{ + QString totalResult; + QTcpSocket socket; + socket.connectToHost("65.50.243.103",80); + if(!socket.waitForConnected()) + { + qDebug() << "Error: " << socket.errorString(); + } + + QString index = m_strUrl.right(2); + QRegExp re("(\\d+)"); + int pos = 0; + QString num; + while((pos = re.indexIn(index, pos)) != -1) + { + num = re.cap(1); + pos += re.matchedLength(); + } + + QString strheader = "POST /proxylist/anonymity/?t=Elite HTTP/1.1\r\n" + "Host: www.gatherproxy.com\r\n" + "Connection: keep-alive\r\n" + "Content-Length: " + QString::number(28+num.length()) + "\r\n" + "Cache-Control: max-age=0\r\n" + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\r\n" + "Origin: http://www.gatherproxy.com\r\n" + "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36\r\n" + "Content-Type: application/x-www-form-urlencoded\r\n" + "Referer: http://www.gatherproxy.com/proxylist/anonymity/?t=Elite\r\n" + "Accept-Encoding: deflate\r\n" + "Accept-Language: ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4\r\n\r\n" + "Type=elite&PageIdx=" + num + "&Uptime=0"; + socket.write(strheader.toUtf8()); + + QString strPacket; + while (socket.waitForReadyRead()) + { + strPacket += QString::fromUtf8(socket.readAll()); + } + //Debug("c:/data/asdf.html", strPacket); + { + int pos = 0; + QRegExp re("\\s*"); + while((pos = re.indexIn(strPacket, pos)) != -1) + { + QString ip = re.cap(1); + QString port = QString::number(getPort(re.cap(2))); + + if(!ip.isNull() && !port.isNull()) + { + totalResult += (ip + "," + port + "\n"); + } + + pos += re.matchedLength(); + } + + } + return totalResult.trimmed(); +} + + bool SCrawler::SendIpList(QString _strIpList) { QSqlDatabase db = QSqlDatabase::addDatabase("QMYSQL"); @@ -464,65 +831,6 @@ bool SCrawler::SendIpList(QString _strIpList) } return true; } -*/ - -bool SCrawler::SendIpList(QString _strIpList) -{ - - QSqlDatabase db = QSqlDatabase::addDatabase("QMYSQL"); - db.setHostName("bigbird.iptime.org"); - db.setUserName("admin"); - db.setPassword("admin123"); - db.setDatabaseName("concepters"); - - if (db.open() == false) - { - qDebug() << "DB open Failed in SendIpList()"; - return false; - } - - QSqlQuery sql; - //QString strQuery = "truncate table Proxy"; - /* - QString strQuery = "delete from Proxy"; - QString strUtf8(strQuery.toUtf8()); - - if (sql.exec(strUtf8) == false) - { - p_labelStatus->setText("Delete Query\n Fail"); - return false; - } - */ - QString strQuery; - QString strUtf8; - QStringList _slIpList = _strIpList.split("\n"); - foreach(QString str, _slIpList) - { - strQuery = "insert into Proxy set Proxy='"; - strQuery += str.split(',').at(0).trimmed(); - strQuery += "', Port="; - strQuery += str.split(',').at(1).trimmed(); - if(str.split(',').size() > 2) - { - strQuery += ", Source='"; - strQuery += str.split(',').at(2).trimmed(); - strQuery += "'"; - } - strUtf8 = strQuery.toUtf8(); - if (sql.exec(strUtf8) == false) - { - //InsertLog(sql.lastQuery() + "is Failed"); - //return false; - cerr << sql.lastQuery().toStdString() << endl; - //cout << "PP send ip list failed" << endl; - } - } - //cout << "PP send ip list : " << _slIpList.size() << endl; - db.close(); - return true; -} - - QList SCrawler::FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength) { @@ -564,4 +872,25 @@ QString SCrawler::addSource(QString _strIpList, QString _strSource) return straddedList.join("\n").trimmed(); } +int SCrawler::getPort(QString _strport) +{ + int result = 0; + for(int i = 0; i < _strport.length(); i++) + { + char strport = _strport.at(i).toLatin1(); + if( 'a' <= strport && strport <= 'f' ) + { + result += (((int)strport - (int)'a' + 10) << ((_strport.length()-1-i)*4)); + } + else if( 'A' <= strport && strport <= 'F' ) + { + result += (((int)strport - (int)'A' + 10) << ((_strport.length()-1-i)*4)); + } + else if( '0' <= strport && strport <= '9') + { + result += (((int)strport - (int)'0') << ((_strport.length()-1-i)*4)); + } + } + return result; +} diff --git a/ProxyProcess/scrawler.h b/ProxyProcess/scrawler.h index e08f6c3..d575f66 100644 --- a/ProxyProcess/scrawler.h +++ b/ProxyProcess/scrawler.h @@ -4,6 +4,7 @@ #include #include #include +class Client; class SCrawler : public QObject { Q_OBJECT @@ -14,6 +15,7 @@ public: void saveFile(); // static void Debug(QString _strFilename,QString _strData); bool Debug(QString _strFilename,QString _strData); + signals: void finished(); private slots: @@ -41,12 +43,23 @@ public: QString SqlString(QString _str); QString GetSafeUtf(QString _strData); int GetNumber(QString _str); + int getPort(QString _strport); bool SendIpList(QString _str); void SearchChildFrame(QWebFrame *frame); QString getIpListFromAss(const QWebElement _FindElement); QString getIpListFromNordVpn(const QWebElement _FindElement); QString getIpListFromCyberSyndrom(const QWebElement _FindElement); QString getIpListFromProxylists(const QWebElement _FindElement); + QString getIpListFromProxySpy(const QString _txt); + QString getIpListFromProxySearcher(const QWebElement _FindElement); + QString getIpListFromProxyListro(const QWebElement _FindElement); + QString getIpListFromSamuir(const QWebElement _FindElement); + QString getIpListFromNntime(const QWebElement _FindElement); + QString getIpListFromFreeproxy(const QWebElement _FindElement); + QString getIpListFromXroxy(const QWebElement _FindElement); + QString getIpListFromCoolProxy(const QWebElement _FindElement); + QString getIpListFromGatherProxy(); + QString addSource(QString _strIpList, QString _strSource); QWebElement Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind);