gatherproxy.com 사이트 크롤링 수정

git-svn-id: svn://192.168.0.12/source@269 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2016-05-30 02:15:02 +00:00
parent e1c6d46828
commit 8a2b90eb5c
3 changed files with 81 additions and 56 deletions

View File

@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE QtCreatorProject> <!DOCTYPE QtCreatorProject>
<!-- Written by QtCreator 3.3.0, 2015-10-15T15:42:58. --> <!-- Written by QtCreator 3.3.0, 2016-05-27T16:51:29. -->
<qtcreator> <qtcreator>
<data> <data>
<variable>EnvironmentId</variable> <variable>EnvironmentId</variable>
@@ -227,7 +227,7 @@
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName">ProxyProcess</value> <value type="QString" key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName">ProxyProcess</value>
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DisplayName"></value> <value type="QString" key="ProjectExplorer.ProjectConfiguration.DisplayName"></value>
<value type="QString" key="ProjectExplorer.ProjectConfiguration.Id">Qt4ProjectManager.Qt4RunConfiguration:C:/source/ProxyProcess/ProxyProcess.pro</value> <value type="QString" key="ProjectExplorer.ProjectConfiguration.Id">Qt4ProjectManager.Qt4RunConfiguration:C:/source/ProxyProcess/ProxyProcess.pro</value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.CommandLineArguments"></value> <value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.CommandLineArguments">&quot;http://www.gatherproxy.com/proxylist/anonymity/?t=Elite&quot; &quot;c:\data\proxytest.txt&quot;</value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.ProFile">ProxyProcess.pro</value> <value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.ProFile">ProxyProcess.pro</value>
<value type="bool" key="Qt4ProjectManager.Qt4RunConfiguration.UseDyldImageSuffix">false</value> <value type="bool" key="Qt4ProjectManager.Qt4RunConfiguration.UseDyldImageSuffix">false</value>
<value type="bool" key="Qt4ProjectManager.Qt4RunConfiguration.UseTerminal">true</value> <value type="bool" key="Qt4ProjectManager.Qt4RunConfiguration.UseTerminal">true</value>

View File

@@ -14,14 +14,12 @@ struct SProxyList
int m_nPort; int m_nPort;
}; };
SCrawler::SCrawler():QObject() SCrawler::SCrawler():QObject(), m_bDone(false), m_bCrawled(false)
{ {
m_page = new QWebPage; m_page = new QWebPage;
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool))); connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
// p_timer = new QTimer(this); // p_timer = new QTimer(this);
// connect(p_timer,SIGNAL(timeout()), this, SLOT(killProcess())); // connect(p_timer,SIGNAL(timeout()), this, SLOT(killProcess()));
QTimer::singleShot(60000, this, SLOT(killProcess()));
m_bCrawled = false;
} }
@@ -32,9 +30,16 @@ SCrawler::~SCrawler()
void SCrawler::load(QStringList _strlistArgv) void SCrawler::load(QStringList _strlistArgv)
{ {
QUrl url(_strlistArgv.at(0)); QUrl url(_strlistArgv.at(0));
QNetworkRequest *request = new QNetworkRequest; //QNetworkRequest *request = new QNetworkRequest;
m_strUrl = _strlistArgv.at(0); m_strUrl = _strlistArgv.at(0);
if (m_strUrl.contains("gatherproxy.com"))
QTimer::singleShot(600000, this, SLOT(killProcess()));
else
QTimer::singleShot(60000, this, SLOT(killProcess()));
m_page->settings()->setAttribute(QWebSettings::AutoLoadImages, true); m_page->settings()->setAttribute(QWebSettings::AutoLoadImages, true);
m_page->settings()->setAttribute(QWebSettings::JavaEnabled, true); m_page->settings()->setAttribute(QWebSettings::JavaEnabled, true);
@@ -198,10 +203,23 @@ void SCrawler::saveResult(bool ok)
} }
else if(m_strUrl.contains("gatherproxy.com")) else if(m_strUrl.contains("gatherproxy.com"))
{ {
strIpList = getIpListFromGatherProxy(); if (!m_bDone)
{
if (!m_bCrawled)
{
m_bCrawled = true;
getIpListFromGatherProxy();
m_bCrawled = false;
}
return;
}
else
{
strIpList = m_strIpList.trimmed();
if(strIpList.trimmed().size() > 0 ) if(strIpList.trimmed().size() > 0 )
strIpList = addSource(strIpList, "gatherproxy.com"); strIpList = addSource(strIpList, "gatherproxy.com");
} }
}
else if(m_strUrl.contains("wait3")) else if(m_strUrl.contains("wait3"))
{ {
@@ -714,67 +732,67 @@ QString SCrawler::getIpListFromCoolProxy(const QWebElement _FindElement)
QString SCrawler::getIpListFromGatherProxy() void SCrawler::getIpListFromGatherProxy()
{ {
static bool b_first = true;
QString totalResult; QString totalResult;
QTcpSocket socket; if (b_first)
socket.connectToHost("65.50.243.103",80);
if(!socket.waitForConnected())
{ {
qDebug() << "Error: " << socket.errorString(); b_first = false;
QWebElement button = Find(m_page->mainFrame()->documentElement(), "input", "class", "button");
button.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
return;
} }
QString index = m_strUrl.right(2); QWebElement webTable = Find(m_page->mainFrame()->documentElement(), "table", "id", "tblproxy");
QRegExp re("(\\d+)"); if (webTable.isNull())
int pos = 0;
QString num;
while((pos = re.indexIn(index, pos)) != -1)
{ {
num = re.cap(1); m_bDone = true;
pos += re.matchedLength(); QTimer::singleShot(1000, this, SLOT(saveResultManual()));
return;
} }
QString strheader = "POST /proxylist/anonymity/?t=Elite HTTP/1.1\r\n" QWebElementCollection trs = webTable.findAll("tr");
"Host: www.gatherproxy.com\r\n" if (trs.count() > 2)
"Connection: keep-alive\r\n"
"Content-Length: " + QString::number(28+num.length()) + "\r\n"
"Cache-Control: max-age=0\r\n"
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\r\n"
"Origin: http://www.gatherproxy.com\r\n"
"User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36\r\n"
"Content-Type: application/x-www-form-urlencoded\r\n"
"Referer: http://www.gatherproxy.com/proxylist/anonymity/?t=Elite\r\n"
"Accept-Encoding: deflate\r\n"
"Accept-Language: ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4\r\n\r\n"
"Type=elite&PageIdx=" + num + "&Uptime=0";
socket.write(strheader.toUtf8());
QString strPacket;
while (socket.waitForReadyRead())
{ {
strPacket += QString::fromUtf8(socket.readAll()); for (int i = 2; i < trs.count(); i++)
}
//Debug("c:/data/asdf.html", strPacket);
{ {
int pos = 0; QWebElementCollection tds = trs.at(i).findAll("td");
QRegExp re("<td><script>document\\.write\\('(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})'\\)</script></td>\\s*<td><script>document\\.write\\(gp\\.dep\\('([A-Fa-f0-9]{2,4})'\\)\\)</script>"); if (tds.count() > 2)
while((pos = re.indexIn(strPacket, pos)) != -1)
{
QString ip = re.cap(1);
QString port = QString::number(getPort(re.cap(2)));
if(!ip.isNull() && !port.isNull())
{ {
QString ip = tds.at(1).toPlainText();
QString port = tds.at(2).toPlainText();
totalResult += (ip + "," + port + "\n"); totalResult += (ip + "," + port + "\n");
} }
}
pos += re.matchedLength();
} }
m_strIpList += totalResult;
QWebElement webPageNavi = Find(m_page->mainFrame()->documentElement(), "div", "class", "pagenavi");
if (webPageNavi.isNull())
{
m_bDone = true;
QTimer::singleShot(1000, this, SLOT(saveResultManual()));
return;
} }
return totalResult.trimmed(); QWebElement span = webPageNavi.findFirst("span");
QWebElement webA = span.nextSibling();
if (webA.isNull())
{
m_bDone = true;
QTimer::singleShot(1000, this, SLOT(saveResultManual()));
return;
}
else
{
webA.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
return;
}
}
void SCrawler::saveResultManual()
{
saveResult(true);
} }
bool SCrawler::SendIpList(QString _strIpList) bool SCrawler::SendIpList(QString _strIpList)
{ {
@@ -850,7 +868,10 @@ QList<QWebElement> SCrawler::FindAllMid(const QWebElement _FindElement,const QSt
void SCrawler::killProcess() void SCrawler::killProcess()
{ {
if (m_strIpList.isEmpty())
cout << endl << "timeout"; cout << endl << "timeout";
else
cout << m_strIpList.toStdString() << "ok";
emit finished(); emit finished();
} }

View File

@@ -20,6 +20,7 @@ signals:
void finished(); void finished();
private slots: private slots:
void saveResult(bool ok); void saveResult(bool ok);
void saveResultManual();
void killProcess(); void killProcess();
private: private:
int m_nSelect; int m_nSelect;
@@ -34,6 +35,9 @@ private:
QString m_strFolder; QString m_strFolder;
QString m_strLocation; QString m_strLocation;
QString m_strIpList;
bool m_bDone;
bool m_bUse; bool m_bUse;
bool m_bLast; bool m_bLast;
bool m_bError; bool m_bError;
@@ -58,7 +62,7 @@ public:
QString getIpListFromFreeproxy(const QWebElement _FindElement); QString getIpListFromFreeproxy(const QWebElement _FindElement);
QString getIpListFromXroxy(const QWebElement _FindElement); QString getIpListFromXroxy(const QWebElement _FindElement);
QString getIpListFromCoolProxy(const QWebElement _FindElement); QString getIpListFromCoolProxy(const QWebElement _FindElement);
QString getIpListFromGatherProxy(); void getIpListFromGatherProxy();
QString addSource(QString _strIpList, QString _strSource); QString addSource(QString _strIpList, QString _strSource);