268 lines
8.3 KiB
C++
268 lines
8.3 KiB
C++
#include "scrawler.h"
|
|
#include <iostream>
|
|
#include <QSqlQuery>
|
|
#include <QSqlError>
|
|
#include <QByteArray>
|
|
#include <qDebug>
|
|
|
|
using namespace std;
|
|
|
|
struct SProxyList
|
|
{
|
|
QString m_strAddress;
|
|
int m_nPort;
|
|
};
|
|
|
|
// Builds the crawler and wires up its web page.
//
// The QWebPage is created with this object as its QObject parent so that Qt
// destroys it together with the crawler; nothing in this file deletes the
// page explicitly. The page's loadFinished(bool) signal is routed to
// saveResult(bool), which does the actual scraping.
SCrawler::SCrawler():QObject()
{
    // Start in the "no error" state; UpdateError() is what raises the flag.
    m_bError = false;

    m_page = new QWebPage(this);
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}
|
|
|
|
// Destructor. Performs no explicit cleanup of its own.
// NOTE(review): m_page is allocated with `new` in the constructor and is not
// deleted here - confirm that its ownership is handled (e.g. via a QObject
// parent).
SCrawler::~SCrawler()
{
    // intentionally empty
}
|
|
|
|
// Starts loading the page to scrape.
//
// _strlistArgv: element 0 is the URL to load; the optional element 1 is a
//               folder that is prefixed to the output file name written by
//               saveResult().
//
// Image loading and Java are switched off on the page before the request is
// issued. When the list is empty nothing is loaded: the problem is reported on
// stderr and finished() is delivered through a queued call, so that it also
// arrives when load() is invoked before the event loop has started.
void SCrawler::load(QStringList _strlistArgv)
{
    if (_strlistArgv.isEmpty())
    {
        // at(0) on an empty list is out of range, so bail out instead.
        std::cerr << "No URL given to load" << std::endl;
        QMetaObject::invokeMethod(this, "finished", Qt::QueuedConnection);
        return;
    }

    const QUrl url(_strlistArgv.at(0));

    QWebSettings *settings = m_page->settings();
    settings->setAttribute(QWebSettings::AutoLoadImages, false);
    settings->setAttribute(QWebSettings::JavaEnabled, false);

    // Qt file classes accept '/' as the separator on every platform, whereas a
    // backslash is only a separator on Windows.
    if (_strlistArgv.length() > 1)
        m_strFolder = _strlistArgv.at(1) + "/";

    m_page->currentFrame()->load(url);
}
|
|
|
|
// Raises the crawler's error flag.
//
// _strError: error description; this function does not use it.
void SCrawler::UpdateError(QString _strError)
{
    Q_UNUSED(_strError);
    m_bError = true;
}
|
|
|
|
void SCrawler::saveResult(bool ok)
|
|
{
|
|
if (!ok)
|
|
{
|
|
std::cerr << "Failed loading " << qPrintable(m_page->mainFrame()->url().toString()) << std::endl;
|
|
emit finished();
|
|
return;
|
|
}
|
|
|
|
QWebElement p_parse = Find(m_page->currentFrame()->documentElement(),"div","class","row inner");
|
|
|
|
//Debug("test.html",m_page->currentFrame()->toHtml());
|
|
//Debug("filtered.html", p_parse.toInnerXml());
|
|
|
|
QWebElementCollection trs = p_parse.findAll("tr");
|
|
QString DISPLAYINLINE = "{display:inline}";
|
|
QString DISPLAYNONE = "{display:none}";
|
|
QString totalResult;
|
|
|
|
for(int i = 1; i < trs.count(); i++)
|
|
{
|
|
QWebElementCollection tds = trs.at(i).findAll("td");
|
|
|
|
QWebElement firstStyle = trs.at(i).findFirst("style");
|
|
|
|
QString strFirstStyle = firstStyle.toInnerXml().trimmed();
|
|
|
|
QStringList disinline;
|
|
QStringList disnone;
|
|
// {display:inline} {display:none}
|
|
QStringList strlist = strFirstStyle.split(".", QString::SkipEmptyParts);
|
|
|
|
foreach(QString str, strlist)
|
|
{
|
|
if(str.trimmed().right(DISPLAYINLINE.length()).compare(DISPLAYINLINE) == 0)
|
|
{
|
|
disinline.append(str.trimmed().left(str.trimmed().length() - DISPLAYINLINE.length()));
|
|
// cout << DISPLAYINLINE.toStdString() << " : " << str.trimmed().left(str.trimmed().length() - DISPLAYINLINE.length()).toStdString() << endl;
|
|
}
|
|
else
|
|
{
|
|
disnone.append(str.trimmed().left(str.trimmed().length() - DISPLAYNONE.length()));
|
|
// cout << DISPLAYNONE.toStdString() << " : " << str.trimmed().left(str.trimmed().length() - DISPLAYNONE.length()).toStdString() << endl;
|
|
}
|
|
}
|
|
|
|
QString strIpContents = tds.at(1).toInnerXml();
|
|
foreach(QString str, disinline)
|
|
{
|
|
strIpContents.replace(str.trimmed(),"display: inline");
|
|
}
|
|
foreach(QString str, disnone)
|
|
{
|
|
strIpContents.replace(str.trimmed(),"display:none");
|
|
}
|
|
|
|
// cout << strIpContents.toStdString() << endl;
|
|
QStringList strIpContentsList = strIpContents.split(QRegExp("[<>]"),QString::SkipEmptyParts);
|
|
QString result;
|
|
for(int j = 0; j < strIpContentsList.count(); j++)
|
|
{
|
|
QString str1 = strIpContentsList.at(j).trimmed();
|
|
|
|
if(str1.at(0) == '/')
|
|
continue;
|
|
if(str1.left(4).compare("span") == 0)
|
|
continue;
|
|
if(str1.left(3).compare("div") == 0)
|
|
continue;
|
|
if(str1.left(5).compare("style") == 0)
|
|
continue;
|
|
if(str1.mid(1,3).compare("dis") == 0)
|
|
continue;
|
|
|
|
if(j > 0)
|
|
{
|
|
QString str2 = strIpContentsList.at(j-1).trimmed();
|
|
if(str2.right(5).compare("none\"") == 0)
|
|
continue;
|
|
}
|
|
|
|
result += str1;
|
|
}
|
|
|
|
//cout << strIpContentsList.length() << endl;
|
|
cout << result.toStdString() << endl;
|
|
//Debug("proxy.txt",result);
|
|
QString port = tds.at(2).toInnerXml().trimmed();
|
|
result += ("," + port);
|
|
totalResult += result;
|
|
if(i < trs.count() - 1)
|
|
totalResult += "\n";
|
|
}
|
|
|
|
Debug(m_strFolder + "proxy.txt",totalResult);
|
|
|
|
emit finished();
|
|
}
|
|
|
|
// Extracts the ASCII digits '0'..'9' from _str, in order, and converts the
// resulting digit string to an int.
//
// Only ASCII digits are collected: QChar::isNumber() also accepts characters
// such as superscripts or fractions, which QString::toInt() cannot parse and
// which would therefore turn the whole result into 0.
//
// Returns the parsed value, or 0 when _str contains no digit or the digit
// string cannot be converted by toInt().
int SCrawler::GetNumber(QString _str)
{
    QString strDigits;
    for (int i = 0; i < _str.size(); ++i)
    {
        const QChar ch = _str.at(i);
        if (ch >= QLatin1Char('0') && ch <= QLatin1Char('9'))
            strDigits += ch;
    }
    return strDigits.toInt();
}
|
|
|
|
// Writes _strData to the text file _strFilename, replacing any previous
// content.
//
// The file is opened with Truncate rather than being removed and reopened in
// append mode: an existing file is no longer deleted when it cannot be
// reopened, and stale content can no longer be appended to when the removal
// fails. Returns silently if the file cannot be opened.
void SCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile file(_strFilename);
    if (!file.open(QIODevice::WriteOnly | QIODevice::Text | QIODevice::Truncate))
        return;

    QTextStream out(&file);
    out << _strData;
    out.flush();    // push buffered text to the device before closing it
    file.close();
}
|
|
|
|
// Backslash-escapes _str for embedding in a quoted SQL string literal.
//
// Backslashes are doubled first, then ' and " are prefixed with a backslash.
// Doing the backslash pass first matters: otherwise an input such as \' would
// become \\' and the quote would end the literal again.
//
// NOTE(review): bound parameters (QSqlQuery::prepare / bindValue) avoid manual
// escaping altogether and should be preferred where the caller allows it.
QString SCrawler::SqlString(QString _str)
{
    _str.replace("\\", "\\\\");
    _str.replace("'", "\\'");
    _str.replace("\"", "\\\"");
    return _str;
}
|
|
|
|
|
|
// Returns a filtered copy of _strData.
//
// A character is appended once for each of the following tests it passes:
//  * code point in 12593..12622 (U+3131..U+314E, Hangul compatibility jamo),
//  * code point in 44032..55203 (U+AC00..U+D7A3, Hangul syllables),
//  * QChar classifies it as digit, number, space, lower-case, upper-case or
//    symbol.
// Everything else (punctuation, for instance) is dropped.
QString SCrawler::GetSafeUtf(QString _strData)
{
    QString filtered;
    const int total = _strData.length();

    for (int idx = 0; idx < total; ++idx)
    {
        const QChar ch = _strData.at(idx);
        const ushort code = ch.unicode();

        const bool inFirstRange = (code >= 12593 && code <= 12622);
        const bool inSecondRange = (code >= 44032 && code <= 55203);
        const bool inBasicClass = ch.isDigit() || ch.isNumber() || ch.isSpace()
                               || ch.isLower() || ch.isUpper() || ch.isSymbol();

        // The three tests stay independent, exactly as the original ifs.
        if (inFirstRange)
            filtered += ch;
        if (inSecondRange)
            filtered += ch;
        if (inBasicClass)
            filtered += ch;
    }

    return filtered;
}
|
|
|
|
// Returns the first descendant of _FindElement that matches the selector
// _strElement and whose attribute _strAttrib equals _strFind exactly.
// A null QWebElement is returned when no descendant qualifies.
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);

    for (int idx = 0; idx < candidates.count(); ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        if (candidate.attribute(_strAttrib) == _strFind)
            return candidate;
    }

    return QWebElement();
}
|
|
|
|
// Returns the first descendant of _FindElement matching the selector
// _strElement for which the trimmed value of attribute _strAttrib, cut to
// mid(_strStart, _strLength), equals _strFind ignoring case.
// A null QWebElement is returned when no descendant qualifies.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);

    for (int idx = 0; idx < candidates.count(); ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString slice = candidate.attribute(_strAttrib).trimmed().mid(_strStart, _strLength);
        if (slice.compare(_strFind, Qt::CaseInsensitive) == 0)
            return candidate;
    }

    return QWebElement();
}
|
|
|
|
// Convenience overload of FindMid(): the compared slice starts at _strStart
// and is as long as _strFind itself.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart, _strFind.length());
}
|
|
|
|
// Returns the first descendant of _FindElement matching the selector
// _strElement whose trimmed attribute _strAttrib ends with _strFind (compared
// ignoring case). A null QWebElement is returned when nothing matches.
//
// Every inspected candidate is traced on stdout.
// NOTE(review): the sibling Find* helpers print nothing - confirm whether this
// tracing is still wanted.
QWebElement SCrawler::FindRight(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);

    for (int idx = 0; idx < candidates.count(); ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString tail = candidate.attribute(_strAttrib).trimmed().right(_strFind.length());

        std::cout << "FindRight : " << tail.toStdString() << std::endl;
        std::cout << "FindRight right : " << _strFind.toStdString() << std::endl;

        if (tail.compare(_strFind, Qt::CaseInsensitive) == 0)
            return candidate;
    }

    return QWebElement();
}
|
|
|
|
// Returns the first descendant of _FindElement matching the selector
// _strElement whose trimmed attribute _strAttrib begins with _strFind
// (compared ignoring case); implemented as FindMid() with start offset 0.
QWebElement SCrawler::FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, 0);
}
|
|
|
|
|
|
|
|
// Collects, in document order of findAll(), every descendant of _FindElement
// matching the selector _strElement for which the trimmed value of attribute
// _strAttrib, cut to mid(_strStart, _strLength), equals _strFind ignoring
// case. The list is empty when nothing matches.
QList<QWebElement> SCrawler::FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    QList<QWebElement> matches;

    for (int idx = 0; idx < candidates.count(); ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString slice = candidate.attribute(_strAttrib).trimmed().mid(_strStart, _strLength);
        if (slice.compare(_strFind, Qt::CaseInsensitive) == 0)
            matches.append(candidate);
    }

    return matches;
}
|