gatherproxy.com 사이트 크롤링 수정

git-svn-id: svn://192.168.0.12/source@269 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2016-05-30 02:15:02 +00:00
parent e1c6d46828
commit 8a2b90eb5c
3 changed files with 81 additions and 56 deletions

View File

@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE QtCreatorProject>
<!-- Written by QtCreator 3.3.0, 2015-10-15T15:42:58. -->
<!-- Written by QtCreator 3.3.0, 2016-05-27T16:51:29. -->
<qtcreator>
<data>
<variable>EnvironmentId</variable>
@@ -227,7 +227,7 @@
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DefaultDisplayName">ProxyProcess</value>
<value type="QString" key="ProjectExplorer.ProjectConfiguration.DisplayName"></value>
<value type="QString" key="ProjectExplorer.ProjectConfiguration.Id">Qt4ProjectManager.Qt4RunConfiguration:C:/source/ProxyProcess/ProxyProcess.pro</value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.CommandLineArguments"></value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.CommandLineArguments">&quot;http://www.gatherproxy.com/proxylist/anonymity/?t=Elite&quot; &quot;c:\data\proxytest.txt&quot;</value>
<value type="QString" key="Qt4ProjectManager.Qt4RunConfiguration.ProFile">ProxyProcess.pro</value>
<value type="bool" key="Qt4ProjectManager.Qt4RunConfiguration.UseDyldImageSuffix">false</value>
<value type="bool" key="Qt4ProjectManager.Qt4RunConfiguration.UseTerminal">true</value>

View File

@@ -14,14 +14,12 @@ struct SProxyList
int m_nPort;
};
SCrawler::SCrawler():QObject()
SCrawler::SCrawler():QObject(), m_bDone(false), m_bCrawled(false)
{
m_page = new QWebPage;
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
// p_timer = new QTimer(this);
// connect(p_timer,SIGNAL(timeout()), this, SLOT(killProcess()));
QTimer::singleShot(60000, this, SLOT(killProcess()));
m_bCrawled = false;
}
@@ -32,9 +30,16 @@ SCrawler::~SCrawler()
void SCrawler::load(QStringList _strlistArgv)
{
QUrl url(_strlistArgv.at(0));
QNetworkRequest *request = new QNetworkRequest;
//QNetworkRequest *request = new QNetworkRequest;
m_strUrl = _strlistArgv.at(0);
if (m_strUrl.contains("gatherproxy.com"))
QTimer::singleShot(600000, this, SLOT(killProcess()));
else
QTimer::singleShot(60000, this, SLOT(killProcess()));
m_page->settings()->setAttribute(QWebSettings::AutoLoadImages, true);
m_page->settings()->setAttribute(QWebSettings::JavaEnabled, true);
@@ -198,10 +203,23 @@ void SCrawler::saveResult(bool ok)
}
else if(m_strUrl.contains("gatherproxy.com"))
{
strIpList = getIpListFromGatherProxy();
if (!m_bDone)
{
if (!m_bCrawled)
{
m_bCrawled = true;
getIpListFromGatherProxy();
m_bCrawled = false;
}
return;
}
else
{
strIpList = m_strIpList.trimmed();
if(strIpList.trimmed().size() > 0 )
strIpList = addSource(strIpList, "gatherproxy.com");
}
}
else if(m_strUrl.contains("wait3"))
{
@@ -714,67 +732,67 @@ QString SCrawler::getIpListFromCoolProxy(const QWebElement _FindElement)
QString SCrawler::getIpListFromGatherProxy()
void SCrawler::getIpListFromGatherProxy()
{
static bool b_first = true;
QString totalResult;
QTcpSocket socket;
socket.connectToHost("65.50.243.103",80);
if(!socket.waitForConnected())
if (b_first)
{
qDebug() << "Error: " << socket.errorString();
b_first = false;
QWebElement button = Find(m_page->mainFrame()->documentElement(), "input", "class", "button");
button.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
return;
}
QString index = m_strUrl.right(2);
QRegExp re("(\\d+)");
int pos = 0;
QString num;
while((pos = re.indexIn(index, pos)) != -1)
QWebElement webTable = Find(m_page->mainFrame()->documentElement(), "table", "id", "tblproxy");
if (webTable.isNull())
{
num = re.cap(1);
pos += re.matchedLength();
m_bDone = true;
QTimer::singleShot(1000, this, SLOT(saveResultManual()));
return;
}
QString strheader = "POST /proxylist/anonymity/?t=Elite HTTP/1.1\r\n"
"Host: www.gatherproxy.com\r\n"
"Connection: keep-alive\r\n"
"Content-Length: " + QString::number(28+num.length()) + "\r\n"
"Cache-Control: max-age=0\r\n"
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8\r\n"
"Origin: http://www.gatherproxy.com\r\n"
"User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.132 Safari/537.36\r\n"
"Content-Type: application/x-www-form-urlencoded\r\n"
"Referer: http://www.gatherproxy.com/proxylist/anonymity/?t=Elite\r\n"
"Accept-Encoding: deflate\r\n"
"Accept-Language: ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4\r\n\r\n"
"Type=elite&PageIdx=" + num + "&Uptime=0";
socket.write(strheader.toUtf8());
QString strPacket;
while (socket.waitForReadyRead())
QWebElementCollection trs = webTable.findAll("tr");
if (trs.count() > 2)
{
strPacket += QString::fromUtf8(socket.readAll());
}
//Debug("c:/data/asdf.html", strPacket);
for (int i = 2; i < trs.count(); i++)
{
int pos = 0;
QRegExp re("<td><script>document\\.write\\('(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})'\\)</script></td>\\s*<td><script>document\\.write\\(gp\\.dep\\('([A-Fa-f0-9]{2,4})'\\)\\)</script>");
while((pos = re.indexIn(strPacket, pos)) != -1)
{
QString ip = re.cap(1);
QString port = QString::number(getPort(re.cap(2)));
if(!ip.isNull() && !port.isNull())
QWebElementCollection tds = trs.at(i).findAll("td");
if (tds.count() > 2)
{
QString ip = tds.at(1).toPlainText();
QString port = tds.at(2).toPlainText();
totalResult += (ip + "," + port + "\n");
}
pos += re.matchedLength();
}
}
m_strIpList += totalResult;
QWebElement webPageNavi = Find(m_page->mainFrame()->documentElement(), "div", "class", "pagenavi");
if (webPageNavi.isNull())
{
m_bDone = true;
QTimer::singleShot(1000, this, SLOT(saveResultManual()));
return;
}
QWebElement span = webPageNavi.findFirst("span");
QWebElement webA = span.nextSibling();
if (webA.isNull())
{
m_bDone = true;
QTimer::singleShot(1000, this, SLOT(saveResultManual()));
return;
}
else
{
webA.evaluateJavaScript("var evObj = new Event('click', {bubbles: true, cancelable: true, view: window}); this.dispatchEvent(evObj);");
return;
}
return totalResult.trimmed();
}
// Argument-less slot that forwards to saveResult(true).
// It is the target of the QTimer::singleShot(1000, this, SLOT(saveResultManual()))
// calls made in getIpListFromGatherProxy() right after m_bDone is set to true,
// so saveResult() is re-entered without waiting for another loadFinished signal.
void SCrawler::saveResultManual()
{
saveResult(true);
}
bool SCrawler::SendIpList(QString _strIpList)
{
@@ -850,7 +868,10 @@ QList<QWebElement> SCrawler::FindAllMid(const QWebElement _FindElement,const QSt
// Timeout slot (armed via QTimer::singleShot in load()): reports on stdout
// what has been gathered so far, then signals completion through finished().
//  - m_strIpList empty : prints a newline (flushing) followed by "timeout"
//  - otherwise         : prints the accumulated m_strIpList followed by "ok"
void SCrawler::killProcess()
{
    const bool bHasResult = !m_strIpList.isEmpty();
    if (bHasResult)
    {
        cout << m_strIpList.toStdString() << "ok";
    }
    else
    {
        cout << endl << "timeout";
    }

    // Emitted in both cases.
    emit finished();
}

View File

@@ -20,6 +20,7 @@ signals:
void finished();
private slots:
void saveResult(bool ok);
void saveResultManual();
void killProcess();
private:
int m_nSelect;
@@ -34,6 +35,9 @@ private:
QString m_strFolder;
QString m_strLocation;
QString m_strIpList;
bool m_bDone;
bool m_bUse;
bool m_bLast;
bool m_bError;
@@ -58,7 +62,7 @@ public:
QString getIpListFromFreeproxy(const QWebElement _FindElement);
QString getIpListFromXroxy(const QWebElement _FindElement);
QString getIpListFromCoolProxy(const QWebElement _FindElement);
QString getIpListFromGatherProxy();
void getIpListFromGatherProxy();
QString addSource(QString _strIpList, QString _strSource);