Files
clients/ProxyProcess/scrawler.cpp
admin a3dda4113e 에러메세지 처리
SendIpList 수행


git-svn-id: svn://192.168.0.12/source@156 8346c931-da38-4b9b-9d4c-e48b93cbd075
2015-07-06 09:56:20 +00:00

568 lines
16 KiB
C++

#include "scrawler.h"
#include <iostream>
#include <QSqlQuery>
#include <QSqlError>
#include <QByteArray>
#include <qDebug>
using namespace std;
// Plain record pairing a proxy address with its port.
// NOTE(review): no code in the visible part of this file uses this struct - confirm before removing.
struct SProxyList
{
// Proxy address (presumably a host name or dotted IP - verify against users of the struct).
QString m_strAddress;
// Proxy port. Not initialized by the struct itself; callers must assign it before reading.
int m_nPort;
};
// Constructs the crawler.
//  - allocates the QWebPage used for loading and routes its loadFinished(bool)
//    signal to saveResult(bool);
//  - arms a one-shot 90 second watchdog that invokes killProcess();
//  - puts the status flags into a defined initial state.
SCrawler::SCrawler():QObject()
{
    // Parent the page to this object so that Qt destroys it together with the
    // crawler; previously it was allocated without an owner and never deleted.
    m_page = new QWebPage(this);
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));

    // Watchdog: give up if nothing has finished after 90 s.
    QTimer::singleShot(90000, this, SLOT(killProcess()));

    m_bCrawled = false;
    // UpdateError() only ever sets this flag to true, so it needs an explicit
    // start value; it used to be left uninitialized.
    m_bError = false;
}
// Destructor. The body is empty: nothing is released explicitly here.
SCrawler::~SCrawler()
{
}
// Starts loading a page.
//   _strlistArgv[0] : URL to load (required)
//   _strlistArgv[1] : optional folder; stored trimmed with a trailing backslash
//   _strlistArgv[2] : optional location tag; defaults to "local"
// When the list is empty nothing can be loaded: the same "Failed loading"
// marker that saveResult() prints is written and finished() is emitted.
void SCrawler::load(QStringList _strlistArgv)
{
    // Guard: at(0) on an empty list is an out-of-range access.
    if (_strlistArgv.isEmpty())
    {
        cout << "Failed loading";
        emit finished();
        return;
    }

    m_strUrl = _strlistArgv.at(0);
    const QUrl url(m_strUrl);

    // The heap-allocated QNetworkRequest that used to be created here was never
    // used nor deleted, so it has been removed.

    m_page->settings()->setAttribute(QWebSettings::AutoLoadImages, true);
    m_page->settings()->setAttribute(QWebSettings::JavaEnabled, true);

    if (_strlistArgv.length() > 1)
        m_strFolder = _strlistArgv.at(1).trimmed() + "\\";

    if (_strlistArgv.length() > 2)
        m_strLocation = _strlistArgv.at(2).trimmed();
    else
        m_strLocation = "local";

    m_page->mainFrame()->load(url);
}
// Raises the error flag. The message text itself is not stored or reported.
void SCrawler::UpdateError(QString _strError)
{
    Q_UNUSED(_strError);
    m_bError = true;
}
void SCrawler::saveResult(bool ok)
{
if (!ok)
{
cout << "Failed loading";
emit finished();
return;
}
Debug("c:/data/test3.html", m_page->currentFrame()->toHtml());
QString strIpList;
if(m_strUrl.contains("hidemyass"))
{
QWebElement p_parse = Find(m_page->currentFrame()->documentElement(),"div","class","row inner");
if(!p_parse.isNull())
{
strIpList = getIpListFromAss(p_parse);
if(strIpList.trimmed().size() > 0 )
strIpList = addSource(strIpList, "hidemyass.com");
}
}
else if(m_strUrl.contains("nordvpn"))
{
QWebElement p_parse = Find(m_page->currentFrame()->documentElement(),"table","class","list");
if(!p_parse.isNull())
{
strIpList = getIpListFromNordVpn(p_parse);
if(strIpList.trimmed().size() > 0 )
strIpList = addSource(strIpList, "nordvpn.com");
}
}
else if(m_strUrl.contains("cyber"))
{
QWebElement p_parse = FindLeft(m_page->currentFrame()->documentElement(),"ol","style","list");
if(!p_parse.isNull())
{
strIpList = getIpListFromCyberSyndrom(p_parse);
if(strIpList.trimmed().size() > 0 )
strIpList = addSource(strIpList, "cybersyndrome.net");
}
}
else if(m_strUrl.contains("proxylists"))
{
QWebElement p_parse = m_page->mainFrame()->findFirstElement("table");
if(!p_parse.isNull())
{
strIpList = getIpListFromProxylists(p_parse);
if(strIpList.trimmed().size() > 0 )
strIpList = addSource(strIpList, "proxylists.net");
}
}
if(strIpList.trimmed().size() > 0)
cout << strIpList.trimmed().toStdString();
// success to crawling
if(strIpList.size() > 8)
{
// in case sending iplist to db
if(m_strLocation.compare("local") != 0)
{
// success to send ip list to db
if(SendIpList(strIpList))
{
cout << endl << "uok";
}
// fail to sen ip list to db
else
{
cout << endl << "fok";
}
}
// in case not sending iplist to db
else
{
cout << endl << "ok";
}
}
// fail to crawling
else
{
cout << "sitedown";
}
emit finished();
}
// Keeps only the characters of _str for which QChar::isNumber() holds, in
// their original order, and returns QString::toInt() of that sequence.
int SCrawler::GetNumber(QString _str)
{
    QString numeric;
    const QChar *cursor = _str.constData();
    const QChar *const last = cursor + _str.size();
    while (cursor != last)
    {
        if (cursor->isNumber())
            numeric.append(*cursor);
        ++cursor;
    }
    return numeric.toInt();
}
// Writes _strData to the file _strFilename, replacing any previous file of
// that name (the old file is removed first, then the new one is opened).
// Returns true on success. When the file cannot be opened, "savefail" is
// printed, finished() is emitted and false is returned.
bool SCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile target(_strFilename);
    QFile::remove(_strFilename);

    const QIODevice::OpenMode mode = QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append;
    if (target.open(mode))
    {
        QTextStream writer(&target);
        writer << _strData;
        target.close();
        return true;
    }

    cout << "savefail" << endl;
    emit finished();
    return false;
}
// Escapes a value for use inside a quoted SQL string literal by prefixing
// backslashes, single quotes and double quotes with a backslash.
// Returns the escaped copy; the caller still has to add the surrounding quotes.
QString SCrawler::SqlString(QString _str)
{
    // The backslash must be escaped first. Otherwise an input such as \' would
    // turn into \\' - an escaped backslash followed by a live quote that
    // terminates the literal.
    _str = _str.replace("\\","\\\\");
    _str = _str.replace("'","\\'");
    _str = _str.replace("\"","\\\"");
    return _str;
}
// Returns a copy of _strData reduced to the accepted characters:
//  - code points 12593..12622 (U+3131..U+314E, Hangul compatibility jamo),
//  - code points 44032..55203 (U+AC00..U+D7A3, Hangul syllables),
//  - digits, numbers, whitespace, lower/upper case letters and symbols.
// Everything else (e.g. punctuation) is dropped.
QString SCrawler::GetSafeUtf(QString _strData)
{
    QString kept;
    const int total = _strData.length();
    for (int pos = 0; pos < total; ++pos)
    {
        const QChar ch = _strData.at(pos);
        const ushort code = ch.unicode();

        const bool hangulJamo = (code >= 12593) && (code <= 12622);
        const bool hangulSyllable = (code >= 44032) && (code <= 55203);
        const bool plain = ch.isDigit() || ch.isNumber() || ch.isSpace()
                        || ch.isLower() || ch.isUpper() || ch.isSymbol();

        // The Hangul ranges are caseless letters, so they never satisfy the
        // "plain" test as well; each character is appended at most once.
        if (hangulJamo || hangulSyllable || plain)
            kept.append(ch);
    }
    return kept;
}
// Returns the first descendant of _FindElement matching the selector
// _strElement whose attribute _strAttrib is exactly equal to _strFind.
// Returns a null QWebElement when there is no such element.
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();
    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        if (candidate.attribute(_strAttrib) == _strFind)
            return candidate;
    }
    return QWebElement();
}
// Returns the first descendant of _FindElement matching the selector
// _strElement for which the substring (_strStart, _strLength) of the trimmed
// attribute _strAttrib equals _strFind, ignoring case.
// Returns a null QWebElement when nothing matches.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();
    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString attrValue = candidate.attribute(_strAttrib).trimmed();
        const QString slice = attrValue.mid(_strStart, _strLength);
        if (QString::compare(slice, _strFind, Qt::CaseInsensitive) == 0)
            return candidate;
    }
    return QWebElement();
}
// Convenience overload: compares exactly _strFind.length() characters of the
// attribute, starting at _strStart.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart, _strFind.length());
}
// Returns the first descendant of _FindElement matching the selector
// _strElement whose trimmed attribute _strAttrib ends with _strFind
// (case-insensitive). Returns a null QWebElement when nothing matches.
QWebElement SCrawler::FindRight(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const int suffixLength = _strFind.length();
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();
    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString tail = candidate.attribute(_strAttrib).trimmed().right(suffixLength);
        if (QString::compare(tail, _strFind, Qt::CaseInsensitive) == 0)
            return candidate;
    }
    return QWebElement();
}
// Returns the first matching element whose trimmed attribute starts with
// _strFind (case-insensitive); implemented as FindMid() from offset 0.
QWebElement SCrawler::FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const int fromStart = 0;
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, fromStart);
}
// Extracts "ip,port" lines from the hidemyass proxy table.
// _FindElement is the container holding the table rows; the first <tr> is
// skipped (the loop starts at index 1). For each remaining row:
//   1. the row's first <style> block is read and its class names are sorted
//      into "display:inline" and "everything else" (treated as display:none);
//   2. inside the markup of the 2nd cell those class names are replaced by
//      the corresponding display text;
//   3. the markup is split at '<' and '>' and only the text pieces that are
//      not tags and do not follow a "...none\"" tag are concatenated - this
//      is the visible IP address;
//   4. the trimmed markup of the 3rd cell is appended as the port.
// Rows are joined with "\n". Returns an empty string when there are no rows.
QString SCrawler::getIpListFromAss(const QWebElement _FindElement)
{
    const QString DISPLAYINLINE = "{display:inline}";
    const QString DISPLAYNONE = "{display:none}";
    QString totalResult;
    QWebElementCollection trs = _FindElement.findAll("tr");
    for(int i = 1; i < trs.count(); i++)
    {
        const QWebElement row = trs.at(i);
        QWebElementCollection tds = row.findAll("td");
        const QString strFirstStyle = row.findFirst("style").toInnerXml().trimmed();

        // Split the style sheet at '.' so that every piece is
        // "<class>{display:inline}" or "<class>{display:none}".
        QStringList disinline;
        QStringList disnone;
        const QStringList strlist = strFirstStyle.split(".", QString::SkipEmptyParts);
        foreach(const QString &rule, strlist)
        {
            const QString trimmedRule = rule.trimmed();
            if(trimmedRule.right(DISPLAYINLINE.length()).compare(DISPLAYINLINE) == 0)
                disinline.append(trimmedRule.left(trimmedRule.length() - DISPLAYINLINE.length()));
            else
                disnone.append(trimmedRule.left(trimmedRule.length() - DISPLAYNONE.length()));
        }

        // Substitute class names by their display text. An empty class name is
        // skipped: replacing "" would insert the text between all characters.
        QString strIpContents = tds.at(1).toInnerXml();
        foreach(const QString &cls, disinline)
        {
            const QString name = cls.trimmed();
            if(name.isEmpty())
                continue;
            strIpContents.replace(name,"display: inline");
        }
        foreach(const QString &cls, disnone)
        {
            const QString name = cls.trimmed();
            if(name.isEmpty())
                continue;
            strIpContents.replace(name,"display:none");
        }

        // Walk the pieces between '<' and '>' and keep the visible text only.
        const QStringList strIpContentsList = strIpContents.split(QRegExp("[<>]"), QString::SkipEmptyParts);
        QString result;
        for(int j = 0; j < strIpContentsList.count(); j++)
        {
            const QString str1 = strIpContentsList.at(j).trimmed();
            // Whitespace-only pieces survive SkipEmptyParts but are empty after
            // trimming; at(0) on them would be an out-of-range access.
            if(str1.isEmpty())
                continue;
            if(str1.at(0) == '/')                     // closing tag
                continue;
            if(str1.left(4).compare("span") == 0)     // opening tags
                continue;
            if(str1.left(3).compare("div") == 0)
                continue;
            if(str1.left(5).compare("style") == 0)
                continue;
            if(str1.mid(1,3).compare("dis") == 0)     // remains of the style sheet text
                continue;
            if(j > 0)
            {
                // Text that directly follows a hidden tag is a decoy.
                const QString str2 = strIpContentsList.at(j-1).trimmed();
                if(str2.right(5).compare("none\"") == 0)
                    continue;
            }
            result += str1;
        }

        const QString strport = tds.at(2).toInnerXml().trimmed();
        result += ("," + strport);
        totalResult += result;
        if(i < trs.count() - 1)
            totalResult += "\n";
    }
    return totalResult;
}
// Extracts "ip,port" lines from the nordvpn proxy table.
// _FindElement is the table element; its first <tbody> is scanned row by row.
// Rows with fewer than four <th> cells are ignored; for the others the plain
// text of the 3rd cell is taken as the address and that of the 4th as the port.
// Lines are joined with "\n" without a leading or trailing separator.
QString SCrawler::getIpListFromNordVpn(const QWebElement _FindElement)
{
    const QWebElement tbody = _FindElement.findFirst("tbody");
    const QWebElementCollection trs = tbody.findAll("tr");

    // Collect the lines first and join afterwards. Appending "\n" per row index
    // left a dangling newline whenever the last row was one of the skipped
    // ones, which later produced an empty record.
    QStringList lines;
    for(int i = 0; i < trs.count(); i++)
    {
        const QWebElementCollection ths = trs.at(i).findAll("th");
        if(ths.count() < 4)
            continue;
        const QString strip = ths.at(2).toPlainText().trimmed();
        const QString strport = ths.at(3).toPlainText().trimmed();
        lines << (strip + "," + strport);
    }
    return lines.join("\n");
}
// Extracts "ip,port" lines from the cybersyndrome list.
// Every <li> below _FindElement contributes one line: its trimmed plain text
// with each ':' turned into ','. Lines are joined with "\n".
QString SCrawler::getIpListFromCyberSyndrom(const QWebElement _FindElement)
{
    const QWebElementCollection items = _FindElement.findAll("li");
    const int total = items.count();

    QStringList lines;
    for (int idx = 0; idx < total; ++idx)
    {
        QString entry = items.at(idx).toPlainText().trimmed();
        entry.replace(":", ",");
        lines.append(entry);
    }
    return lines.join("\n");
}
// Extracts "ip,port" lines from the proxylists table.
// The first <table> nested inside _FindElement is scanned; its last row is
// never visited. A row is used only if it has at least four <td> cells and the
// plain text of the 3rd cell equals "anonymous" (case-insensitive). The 1st
// cell supplies the address and the 2nd the port. Lines are joined with "\n".
QString SCrawler::getIpListFromProxylists(const QWebElement _FindElement)
{
    const QWebElement innerTable = _FindElement.findFirst("table");
    const QWebElementCollection rows = innerTable.findAll("tr");
    const int lastVisited = rows.count() - 1;   // the final row is excluded

    QStringList lines;
    for (int rowIdx = 0; rowIdx < lastVisited; ++rowIdx)
    {
        const QWebElementCollection cells = rows.at(rowIdx).findAll("td");
        if (cells.count() < 4)
            continue;

        const QString anonymity = cells.at(2).toPlainText();
        if (anonymity.compare("anonymous", Qt::CaseInsensitive) != 0)
            continue;

        const QString address = cells.at(0).toPlainText().trimmed();
        const QString portText = cells.at(1).toPlainText().trimmed();
        lines.append(address + "," + portText);
    }
    return lines.join("\n").trimmed();
}
/*
bool SCrawler::SendIpList(QString _strIpList)
{
QSqlDatabase db = QSqlDatabase::addDatabase("QMYSQL");
db.setHostName("bigbird.iptime.org");
db.setUserName("admin");
db.setPassword("admin123");
db.setDatabaseName("concepters");
if (db.open() == false)
{
cout << "error : db open fail..." << endl;
cout << "dbcfail";
emit finished();
return false;
}
QStringList strListIpList = _strIpList.split('\n', QString::SkipEmptyParts);
if(strListIpList.size() < 1)
{
cout << "sitedown";
return false;
}
QSqlQuery sql;
//QString strQuery = "truncate table Proxy";
QString strQuery = "delete from Proxy";
QString strUtf8(strQuery.toUtf8());
if (sql.exec(strUtf8) == false)
{
cout << "error : " << sql.lastError().text().toStdString();
cout << endl << "dbdfail";
//emit finished();
//return;
}
foreach(QString str, strListIpList)
{
strQuery = "insert into Proxy set Proxy='";
strQuery += str.split(',').at(0).trimmed();
strQuery += "', Port=";
strQuery += str.split(',').at(1).trimmed();
strUtf8 = strQuery.toUtf8();
if (sql.exec(strUtf8) == false)
{
cout << "error : " << sql.lastError().text().toStdString();
cout << endl << "dbufail";
return false;
}
}
return true;
}
*/
// Inserts the crawled proxies into the Proxy table of the MySQL database.
// _strIpList holds one record per line in the form "address,port[,source]".
// Returns false only when the database cannot be opened. Individual records
// are handled best-effort: malformed lines are skipped and failed inserts are
// reported on stderr, but neither aborts the run nor changes the result.
bool SCrawler::SendIpList(QString _strIpList)
{
    // NOTE(review): host, user and password are hard-coded in the source;
    // consider moving them to configuration.
    QSqlDatabase db = QSqlDatabase::addDatabase("QMYSQL");
    db.setHostName("bigbird.iptime.org");
    db.setUserName("admin");
    db.setPassword("admin123");
    db.setDatabaseName("concepters");
    if (db.open() == false)
    {
        qDebug() << "DB open Failed in SendIpList()";
        return false;
    }

    QSqlQuery sql;
    const QStringList lines = _strIpList.split("\n");
    foreach(const QString &line, lines)
    {
        // Split once. A blank line or a line without a comma has fewer than
        // two fields; indexing field 1 on it would be out of range.
        const QStringList fields = line.split(',');
        if(fields.size() < 2)
        {
            if(!line.trimmed().isEmpty())
                cerr << "skipped malformed proxy line: " << line.toStdString() << endl;
            continue;
        }

        const QString address = fields.at(0).trimmed();
        bool portOk = false;
        const int port = fields.at(1).trimmed().toInt(&portOk);
        if(address.isEmpty() || !portOk)
        {
            cerr << "skipped malformed proxy line: " << line.toStdString() << endl;
            continue;
        }

        // The values come from crawled web pages, so they are bound as
        // parameters instead of being pasted into the SQL text.
        const bool hasSource = fields.size() > 2;
        if(hasSource)
            sql.prepare("insert into Proxy set Proxy=?, Port=?, Source=?");
        else
            sql.prepare("insert into Proxy set Proxy=?, Port=?");
        sql.addBindValue(address);
        sql.addBindValue(port);
        if(hasSource)
            sql.addBindValue(fields.at(2).trimmed());

        if (sql.exec() == false)
        {
            // Best effort: report the failure and continue with the next record.
            cerr << sql.lastQuery().toStdString()
                 << " [" << address.toStdString() << "," << port << "] : "
                 << sql.lastError().text().toStdString() << endl;
        }
    }

    db.close();
    return true;
}
// Returns every descendant of _FindElement matching the selector _strElement
// for which the substring (_strStart, _strLength) of the trimmed attribute
// _strAttrib equals _strFind, ignoring case. Document order is preserved; the
// list is empty when nothing matches.
QList<QWebElement> SCrawler::FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    QList<QWebElement> matches;
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int total = candidates.count();
    for (int idx = 0; idx < total; ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString slice = candidate.attribute(_strAttrib).trimmed().mid(_strStart, _strLength);
        if (QString::compare(slice, _strFind, Qt::CaseInsensitive) != 0)
            continue;
        matches.append(candidate);
    }
    return matches;
}
// Watchdog slot (armed by the constructor's single-shot timer): reports
// "timeout" on a new stdout line and emits finished().
void SCrawler::killProcess()
{
    cout << endl;
    cout << "timeout";
    emit finished();
}
// Dumps the HTML of the given frame and then, recursively, of all its child
// frames. Every dump goes to the same file, so each one replaces the previous.
void SCrawler::SearchChildFrame(QWebFrame *frame)
{
    Debug("c:/data/nordvpnloop.html", frame->toHtml());

    const QList<QWebFrame*> children = frame->childFrames();
    for (int idx = 0; idx < children.size(); ++idx)
        SearchChildFrame(children.at(idx));
}
// Appends ",<_strSource>" to every line of _strIpList (lines are separated by
// "\n") and returns the re-joined, trimmed text.
QString SCrawler::addSource(QString _strIpList, QString _strSource)
{
    QStringList lines = _strIpList.split("\n");
    const QString suffix = "," + _strSource;
    for (int idx = 0; idx < lines.size(); ++idx)
        lines[idx] += suffix;
    return lines.join("\n").trimmed();
}