Files
clients/CrawlerProcess/scrawler.cpp
admin dfe78744bf proxy limit 추가
git-svn-id: svn://192.168.0.12/source@142 8346c931-da38-4b9b-9d4c-e48b93cbd075
2015-06-04 08:12:05 +00:00

1728 lines
68 KiB
C++

#include "scrawler.h"
#include <iostream>
#include <QSqlQuery>
#include <QSqlError>
#include <QByteArray>
#include <QDebug>
using namespace std;
// One proxy entry: an address string plus a port number.
// NOTE(review): presumably consumed by the proxy helpers (setProxy/deleteProxy) that are
// defined outside this part of the file - confirm against their implementation.
struct SProxyList
{
// Proxy address.
QString m_strAddress;
// Proxy port. No default value is given here, so it is indeterminate until assigned.
int m_nPort;
};
/**
 * Creates the crawler together with the QWebPage used for every page load.
 *
 * The page is created with this object as its QObject parent, so Qt's parent/child
 * ownership destroys it when the crawler is destroyed. (It was previously created
 * without a parent and never deleted.)
 *
 * The page's loadFinished(bool) signal is routed to saveResult(bool), which is where
 * the loaded document gets parsed.
 */
SCrawler::SCrawler():QObject()
{
    m_page = new QWebPage(this);
    connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
}
// Destructor. The body is empty: no explicit cleanup is performed in this function.
SCrawler::~SCrawler()
{
}
/**
 * Configures the crawler from a positional argument list and starts loading the page.
 *
 * Argument positions, as read by this function:
 *   [0] site            : "naver" or "daum"
 *   [1] job             : "cafe_list", "cafe_data", "blog_list", "blog_url" or "blog_comm"
 *   [2] URL to load
 *   [3] suffix appended to "data_" to form the table name (m_strTable)
 *   [4] keyword id for the list jobs and for naver "blog_url";
 *       referer (m_strReper) for "cafe_data"
 *   [5] keyword id for "cafe_data"
 *
 * The list jobs additionally call setProxy() before the request is issued.
 * The result is delivered asynchronously through saveResult(bool).
 *
 * Missing positions are read as empty strings (QStringList::value) instead of indexing
 * out of range, and the request object lives on the stack: QWebFrame::load() takes it by
 * const reference, so the former heap allocation was never released.
 *
 * @param _strlistArgv positional arguments described above.
 */
void SCrawler::load(QStringList _strlistArgv)
{
    m_bUse = false;

    const QString strSite = _strlistArgv.value(0);
    const QString strJob  = _strlistArgv.value(1);

    if (strSite == "naver")
    {
        if (strJob == "cafe_list")
        {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_NAVER_CAFE_LIST;
            m_strKeywordID = _strlistArgv.value(4);
            setProxy();
        }
        else if (strJob == "cafe_data")
        {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_NAVER_CAFE_DATA;
            m_strReper = _strlistArgv.value(4);
            m_strKeywordID = _strlistArgv.value(5);
        }
        else if (strJob == "blog_list")
        {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_NAVER_BLOG_LIST;
            m_strKeywordID = _strlistArgv.value(4);
            setProxy();
        }
        else if (strJob == "blog_url")
        {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_NAVER_BLOG_BODY;
            m_strKeywordID = _strlistArgv.value(4);
            m_bUse = true;
        }
        else if (strJob == "blog_comm")
        {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_NAVER_BLOG_REPLY;
        }
        m_strTable = "data_" + _strlistArgv.value(3);
    }
    else if (strSite == "daum")
    {
        if (strJob == "cafe_list")
        {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_DAUM_CAFE_LIST;
            m_strKeywordID = _strlistArgv.value(4);
            setProxy();
        }
        else if (strJob == "cafe_data")
        {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_DAUM_CAFE_DATA;
            m_strReper = _strlistArgv.value(4);
            m_strKeywordID = _strlistArgv.value(5);
        }
        else if (strJob == "blog_list")
        {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_DAUM_BLOG_LIST;
            m_strKeywordID = _strlistArgv.value(4);
            setProxy();
        }
        else if (strJob == "blog_url")
        {
            // Unlike the naver variant, no keyword id is read for this job.
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_DAUM_BLOG_BODY;
            m_bUse = true;
        }
        else if (strJob == "blog_comm")
        {
            m_strUrl = _strlistArgv.value(2);
            m_nSelect = E_DAUM_BLOG_REPLY;
        }
        m_strTable = "data_" + _strlistArgv.value(3);
    }

    // The URL is echoed to stdout before the request is sent.
    cout << m_strUrl.toStdString() << endl;

    QUrl url = QUrl(m_strUrl);
    if (url.scheme().isEmpty())
        url.setScheme("http");

    m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);

    QNetworkRequest request;
    request.setUrl(url);
    /*
    Cache-busting headers, currently disabled:
    request.setRawHeader("Cache-Control","max-age=0, no-cache");
    request.setRawHeader("Pragma","no-cache");
    request.setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
    */
    // The referer is only sent for the naver cafe article job.
    if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
        request.setRawHeader("Referer",m_strReper.toLocal8Bit());
    request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");

    m_page->mainFrame()->load(request);

    m_bLast = false;
    m_bError = false;
}
/**
 * Raises the m_bError flag.
 *
 * The text argument is accepted but not used: the SQL that used to write it into an
 * ERROR column (update ... set ERROR = '<text>' where URL='<m_strUrl>') is disabled,
 * so the only remaining effect is the flag.
 *
 * @param _strError status text; ignored.
 */
void SCrawler::UpdateError(QString _strError)
{
    Q_UNUSED(_strError);
    m_bError = true;
}
void SCrawler::saveResult(bool ok)
{
if (!ok)
{
cout << "Failed loading";
deleteProxy();
emit finished();
return;
}
switch(m_nSelect)
{
case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break;
case E_NAVER_CAFE_DATA:
{
saveFrameCafeUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break;
case E_NAVER_BLOG_BODY:
{
saveFrameUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break;
case E_DAUM_CAFE_LIST:saveFrameDaumCafeList(m_page->mainFrame());break;
case E_DAUM_CAFE_DATA:
{
saveFrameDaumCafeUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_DAUM_BLOG_LIST:saveFrameDaumBlogList(m_page->mainFrame());break;
case E_DAUM_BLOG_BODY:
{
saveFrameDaumBlogUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_DAUM_BLOG_REPLY:saveFrameDaumBlogComment(m_page->mainFrame());break;
}
switch(m_nSelect)
{
case E_NAVER_CAFE_LIST:
case E_NAVER_BLOG_LIST:
case E_DAUM_CAFE_LIST:
case E_DAUM_BLOG_LIST:
if (m_bError)
{
cout << "block";// block
deleteProxy();
break;
}
if (m_bLast) cout << "last";
break;
case E_NAVER_BLOG_REPLY:
case E_DAUM_BLOG_REPLY:
cout << "ok";
break;
case E_NAVER_CAFE_DATA:
case E_NAVER_BLOG_BODY:
case E_DAUM_CAFE_DATA:
case E_DAUM_BLOG_BODY:
if (m_bUse == false)
{
cout << "fail";
UpdateError("Error code 0");
}
else
{
if (m_bError == false)
{
cout << "ok";
UpdateError("ok");
}
}
break;
}
emit finished();
}
/**
 * Extracts an integer from a string by keeping only its numeric characters.
 *
 * Every character for which QChar::isNumber() is true is kept, in order, and the
 * resulting string is converted with QString::toInt().
 *
 * @param _str text to scan.
 * @return the converted value; whatever toInt() yields when the collected text is not
 *         a valid int (for example when nothing was collected).
 */
int SCrawler::GetNumber(QString _str)
{
    QString collected;
    const QChar *cursor = _str.constData();
    const QChar *const stop = cursor + _str.size();
    while (cursor != stop)
    {
        if (cursor->isNumber())
            collected.append(*cursor);
        ++cursor;
    }
    return collected.toInt();
}
/**
 * Appends text to a file (debugging aid).
 *
 * The file is opened write-only, in text mode, for appending. If it cannot be opened
 * the call does nothing.
 *
 * @param _strFilename path of the file to append to.
 * @param _strData     text to write.
 */
void SCrawler::Debug(QString _strFilename,QString _strData)
{
    QFile target(_strFilename);
    const QIODevice::OpenMode mode = QIODevice::WriteOnly | QIODevice::Text | QIODevice::Append;
    if (target.open(mode))
    {
        QTextStream writer(&target);
        writer << _strData;
        target.close();
    }
}
/**
 * Backslash-escapes a value for use inside a quoted SQL string literal.
 *
 * Backslashes are doubled first, then single and double quotes are prefixed with a
 * backslash. The backslash step must come first: without it an input such as \' would
 * become \\' - an escaped backslash followed by a bare quote - which ends the literal.
 *
 * @param _str raw text.
 * @return the escaped text.
 */
QString SCrawler::SqlString(QString _str)
{
    _str.replace("\\","\\\\");
    _str.replace("'","\\'");
    _str.replace("\"","\\\"");
    return _str;
}
/**
 * Filters a string down to an allowed set of characters.
 *
 * A character is copied to the result when it satisfies one of three rules, each of
 * which is checked independently:
 *   - code unit in 12593..12622 (U+3131..U+314E),
 *   - code unit in 44032..55203 (U+AC00..U+D7A3),
 *   - QChar reports it as a digit, number, space, lower-case letter, upper-case letter
 *     or symbol.
 * Everything else is dropped.
 *
 * @param _strData text to filter.
 * @return the filtered text.
 */
QString SCrawler::GetSafeUtf(QString _strData)
{
    QString kept;
    const int total = _strData.length();
    for (int pos = 0; pos < total; ++pos)
    {
        const QChar ch = _strData.at(pos);
        const ushort code = ch.unicode();

        const bool inFirstRange  = (code >= 12593 && code <= 12622);
        const bool inSecondRange = (code >= 44032 && code <= 55203);
        const bool isPlainClass  = ch.isDigit() || ch.isNumber() || ch.isSpace()
                                || ch.isLower() || ch.isUpper() || ch.isSymbol();

        if (inFirstRange)
            kept += ch;
        if (inSecondRange)
            kept += ch;
        if (isPlainClass)
            kept += ch;
    }
    return kept;
}
void SCrawler::saveFrameList(QWebFrame *frame)
{
if (m_bUse == true) return;
QWebElement notFound = Find(frame->documentElement(),"div","id","notfound");
if(notFound.isNull() == false)
{
m_bLast = true;
return;
}
QWebElement eleMain = Find(frame->documentElement(),"div","class","blog section _blogBase");
QSqlQuery sql;
QStringList urlList;
for (int i = 0; i < 10 ; i++)
{
QString str = "sp_blog_";
QWebElement sub = Find(eleMain,"li","id",str+QString::number(i+1));
QString strUrl = Find(sub,"a","class","url").toPlainText();
if (strUrl.isEmpty())
{
//m_bLast = true;
//m_bUse = true;
break;
}
QStringList strList = strUrl.split('/');
QString strBlogMe = "blog.me";
if ((strList.at(0).compare("blog.naver.com") != 0 ) && (strList.at(0).right(strBlogMe.length()).compare(strBlogMe) != 0))
{
continue;
}
if((strList.at(0).right(strBlogMe.length()).compare(strBlogMe) == 0))
{
QStringList strSubList = strList.at(0).split('.');
strUrl = "blog.naver.com/";
strUrl += strSubList.at(0);
strUrl += "/";
strUrl += strList.at(1);
}
urlList << QString("http://%1").arg(strUrl);
}
if(urlList.size() > 0)
{
QString strUrlList;
strUrlList = "(";
foreach(QString str, urlList)
{
strUrlList += "'";
strUrlList += str;
strUrlList += "',";
}
strUrlList = strUrlList.left(strUrlList.size() - 1);
strUrlList += ")";
QSqlQuery sql;
QString strQuery = "delete from ";
strQuery += m_strTable;
strQuery += QString(" where article_url in %1").arg(strUrlList);
//qDebug() << strQuery;
if (sql.exec(strQuery.toUtf8()) == false)
{
cout << "error " << sql.lastError().text().toStdString();
cout << strQuery.toStdString();
}
}
for (int i = 0; i < 10 ; i++)
{
QString str = "sp_blog_";
QWebElement sub = Find(eleMain,"li","id",str+QString::number(i+1));
QString strUrl = Find(sub,"a","class","url").toPlainText();
if (strUrl.isEmpty())
{
//m_bLast = true;
m_bUse = true;
break;
}
QStringList strList = strUrl.split('/');
QString strBlogMe = "blog.me";
if ((strList.at(0).compare("blog.naver.com") != 0 ) && (strList.at(0).right(strBlogMe.length()).compare(strBlogMe) != 0))
{
cout << "x http://" << strUrl.toStdString() <<endl; continue;
}
if((strList.at(0).right(strBlogMe.length()).compare(strBlogMe) == 0))
{
QStringList strSubList = strList.at(0).split('.');
strUrl = "blog.naver.com/";
strUrl += strSubList.at(0);
strUrl += "/";
strUrl += strList.at(1);
}
/*
QString strQuery = "select article_url from ";
strQuery += m_strTable;
strQuery += QString(" where article_url = 'http://%1'").arg(strUrl);
sql.exec(strQuery);
*/
//if (sql.size() == 0 || sql.size() == -1)
{
QString str = Find(sub,"a","class","txt84").toPlainText();
str = GetSafeUtf(str);
str.replace("'","\\'");
str.replace("\"","\\\"");
str = str.trimmed();
QString strPlatformId;
if(strUrl.split("/").at(0).compare("blog.naver.com") == 0)
strPlatformId = strUrl.split("/").at(1);
else
strPlatformId = strUrl.split("/").at(0).split(".").at(0);
/*
QString strQuery = QString("insert into ");
strQuery += m_strTable;
strQuery += QString(" set platform_name='naver',platform_form='blog',article_form='body',article_url='%1',platform_id='%2',platform_title='%3',keyword_id='%4'").arg("http://"+strUrl).arg(strPlatformId).arg(str).arg(m_strKeywordID);
QString strUtf8(strQuery.toUtf8());
if (sql.exec(strUtf8) == false)
cout << "error : " << sql.lastError().text().toStdString();
else
*/
cout << "o ";
}
//else
// cout << "v ";
cout << "http://" << strUrl.toStdString() << endl;
m_bUse = true;
}
{
QWebElement total = Find(eleMain,"span","class","title_num");
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
QStringList strList = m_strUrl.split("&");
int nNow = GetNumber(strList.at(strList.size() - 1).split("=").at(1));
if ((nNow + 10) > nTotal || nNow >= 1000)
m_bLast = true;
}
}
// Indices into the per-article string array filled by SCrawler::saveFrameUrl().
// The order matches the column names listed there: article_nickname, article_id,
// article_title, article_date, article_data, platform_title.
enum E_DATA
{
E_DATA_NICK=0,          // author nickname
E_DATA_ID,              // author id
E_DATA_TITLE,           // article title
E_DATA_DATE,            // article date
E_DATA_DATA,            // article body text
E_DATA_PLATFORM_TITLE,  // blog (platform) title
E_DATA_MAX,             // number of entries; used as the array size
};
/**
 * Extracts a naver blog article from a frame tree into bodydata.
 *
 * The function visits the given frame and then recurses into all child frames. Two
 * frame names are acted upon:
 *   - "BuddyConnectIframe": the first line of the "profile_name" div is stored as the
 *     article nickname, together with the article URL.
 *   - "mainFrame": blog title, nickname, author id, article title, date, body text,
 *     profile image URL and profile caption are read and stored, followed by the fixed
 *     fields (table, platform "naver"/"blog"/"body", platform id, keyword id).
 * A missing article date calls UpdateError("Error code 4") and clears m_bUse.
 *
 * Changes compared with the previous version:
 *   - When the page script did not contain "var nickName = '", the old code printed the
 *     error and then still read characters at an unrelated offset as the nickname. The
 *     fallback is now skipped in that case.
 *   - Every split(...).at(n) on page- or URL-derived text is bounds-checked.
 *   - Locals that were never used were removed.
 *
 * @param frame frame to inspect (children are visited recursively).
 */
void SCrawler::saveFrameUrl(QWebFrame *frame)
{
    const QString strFrameName = frame->frameName();

    if (strFrameName.compare(QString("BuddyConnectIframe")) == 0)
    {
        QWebElement profile = Find(frame->documentElement(),"div","class","profile_name");
        QString str = profile.toPlainText().split("\n").at(0);
        if (str.isEmpty() == false)
        {
            bodydata.setData(str, bodydata.ARTICLE_NICKNAME);
            bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
        }
    }

    if (strFrameName.compare(QString("mainFrame")) == 0)
    {
        // Segments of the article URL; index 2 is the host and index 3 the first path
        // part when the URL has the form "http://host/part/...".
        const QStringList urlParts = m_strUrl.split("/");

        QString str[E_DATA_MAX];
        QString strProfile;

        // Blog title: second "|"-separated field of the og:article:author meta tag,
        // falling back to the "blogTitleName" span.
        QWebElement proTitle = Find(frame->documentElement(),"meta","property","og:article:author");
        str[E_DATA_PLATFORM_TITLE] = proTitle.attribute("content").split("|").value(1).trimmed();
        if (str[E_DATA_PLATFORM_TITLE].length() > 0)
        {
            str[E_DATA_PLATFORM_TITLE] = GetSafeUtf(str[E_DATA_PLATFORM_TITLE]);
        }
        else
        {
            proTitle = Find(frame->documentElement(),"span","id","blogTitleName");
            str[E_DATA_PLATFORM_TITLE] = GetSafeUtf(proTitle.toPlainText().trimmed());
        }

        QWebElement image;
        QWebElement profile = Find(frame->documentElement(),"div","id","blog-profile");
        {
            // Nickname: profile box first, then the "var nickName = '...'" script value.
            QWebElement nick = Find(profile,"strong","id","nickNameArea");
            if (nick.toPlainText().isEmpty() == false)
                str[E_DATA_NICK] = nick.toPlainText();
            if (str[E_DATA_NICK].isEmpty())
            {
                const QString strHtml = frame->toHtml();
                const QString strFind = "var nickName = '";
                const int start = strHtml.indexOf(strFind);
                const int valueBegin = start + strFind.length();
                if (start == -1 || valueBegin >= strHtml.length() || strHtml.at(valueBegin) == QChar('\''))
                {
                    cout << "error : nick name can not find and next again connect." << endl;
                }
                else
                {
                    const int end = strHtml.indexOf("'",valueBegin);
                    str[E_DATA_NICK] = strHtml.mid(valueBegin,end - valueBegin);
                }
            }
            str[E_DATA_NICK] = GetSafeUtf(str[E_DATA_NICK]);

            // Author id: equal to the nickname when that matches the URL path part,
            // otherwise taken from the profile box, otherwise derived from the URL.
            if (urlParts.size() > 3 && urlParts.at(3) == str[E_DATA_NICK])
            {
                str[E_DATA_ID] = str[E_DATA_NICK];
            }
            else
            {
                QWebElement id = Find(profile,"span","class","itemfont col");
                if (id.toPlainText().isEmpty() == false)
                {
                    str[E_DATA_ID] = id.toPlainText();
                    str[E_DATA_ID] = str[E_DATA_ID].replace("(","").replace(")","");
                }
                if (str[E_DATA_ID].isEmpty())
                {
                    if (urlParts.value(2).compare("blog.naver.com") == 0)
                        str[E_DATA_ID] = urlParts.value(3);
                    else
                        str[E_DATA_ID] = urlParts.value(2).split(".").at(0);
                }
            }
            if (str[E_DATA_NICK].length() == 0)
                str[E_DATA_NICK] = str[E_DATA_ID];

            image = Find(profile,"img","alt","프로필 이미지");
            strProfile = Find(profile, "p", "class", "caption align").toPlainText().trimmed();
        }
        {
            QWebElement post = Find(frame->documentElement(),"div","id","postListBody");
            QWebElement post_top = Find(post,"table","class","post-top");

            QWebElement title = Find(post_top,"span","class","pcol1 itemSubjectBoldfont");
            if (title.toPlainText().isEmpty() == false)
            {
                str[E_DATA_TITLE] = title.toPlainText();
                str[E_DATA_TITLE] = GetSafeUtf(str[E_DATA_TITLE]);
            }

            // The page shows the date without seconds; ":00" completes it.
            QWebElement date = Find(post_top,"p","class","date fil5 pcol2 _postAddDate");
            str[E_DATA_DATE] = date.toPlainText().trimmed().replace("/","-");
            if (str[E_DATA_DATE].isEmpty() == false)
            {
                str[E_DATA_DATE] += ":00";
            }
            else
            {
                UpdateError("Error code 4");
                m_bUse = false;
            }

            QWebElement body = Find(post,"div","class","post-view pcol2 _param(1)");
            if (body.toPlainText().isEmpty() == false)
            {
                str[E_DATA_DATA] = body.toPlainText();
                str[E_DATA_DATA] = GetSafeUtf(str[E_DATA_DATA]);
            }
        }

        bodydata.setData(str[E_DATA_NICK].trimmed(), bodydata.ARTICLE_NICKNAME);
        bodydata.setData(str[E_DATA_ID].trimmed(), bodydata.ARTICLE_ID);
        bodydata.setData(str[E_DATA_TITLE].trimmed(), bodydata.ARTICLE_TITLE);
        bodydata.setData(str[E_DATA_DATE].trimmed(), bodydata.ARTICLE_DATE);
        bodydata.setData(str[E_DATA_DATA].trimmed(), bodydata.ARTICLE_DATA);
        bodydata.setData(str[E_DATA_PLATFORM_TITLE].trimmed(), bodydata.PLATFORM_TITLE);
        if (image.attribute("src").trimmed().length() != 0)
        {
            bodydata.setData(image.attribute("src").trimmed(), bodydata.ARTICLE_PROFILEURL);
        }
        strProfile = GetSafeUtf(strProfile);
        if (strProfile.length() > 0)
        {
            bodydata.setData(strProfile, bodydata.ARTICLE_PROFILE);
        }
        bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
        bodydata.setTable(m_strTable);
        bodydata.setData("naver", bodydata.PLATFORM_NAME);
        bodydata.setData("blog", bodydata.PLATFORM_FORM);
        bodydata.setData("body", bodydata.ARTICLE_FORM);
        bodydata.setData(urlParts.value(3), bodydata.PLATFORM_ID);
        bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID);
    }

    foreach(QWebFrame *childFrame, frame->childFrames())
        saveFrameUrl(childFrame);
}
void SCrawler::saveFrameComment(QWebFrame *frame)
{
QWebElement group = Find(frame->documentElement(),"ul","id","commentList");
QWebElementCollection elements = group.findAll("li");
QString strParent,strDate,strNick,strComm,strUrl,strId;
QStringList strList = m_strUrl.split("/");
QString strCommUrl;
for (int i=0; i < 5; i++)
strUrl += strList.at(i) + "/";
strUrl = strUrl.left(strUrl.size()-1);
int nCount=0;
foreach (QWebElement element, elements)
{
if (element.attribute("class") == "_countableComment ")
{
strNick = strParent = GetSafeUtf(Find(element,"a","class","nick pcol2").toPlainText());
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
strComm = Find(element,"dd","class","comm pcol2").toPlainText();
strCommUrl = Find(element,"a","class","nick pcol2").attribute("href");
if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0)
strId = strCommUrl.split("/").at(3).trimmed();
if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0)
strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed();
if(strCommUrl.left(1) == "/")
{
QStringList strList = strCommUrl.split("&");
foreach(QString str, strList)
{
if(str.left(3) == "id=")
{
strId = str.right(str.length() - 3);
}
}
}
strComm = GetSafeUtf(strComm);
if (strComm.isEmpty()== false)
{
strComm.replace("'","\\'");
strComm.replace("\"","\\\"");
strComm = strComm.trimmed();
//cout << strComm.toStdString() << endl;
QSqlQuery query;
if(strId.length() > 0)
{
query.prepare(QString("insert into " + m_strTable + " (article_id,article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:ID,:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
query.bindValue(":ID", strId.toUtf8());
}
else
query.prepare(QString("insert into " + m_strTable + " (article_url,platform_id,article_nickname,article_data,article_date,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:PLATFORMID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
/*
cout << "m_strTable = " << m_strTable.toStdString() << endl;
cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl;
cout << "data = " << strComm.toStdString() << endl;
cout << "date = " << strDate.toStdString() << endl;
cout << "urlReply = " << m_strUrl.toStdString() << endl;
cout << "ronum = " << nCount << endl;
*/
query.bindValue(":URL", strUrl.toUtf8());
query.bindValue(":PLATFORMID",strUrl.split("/").at(3).toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strComm.toUtf8());
query.bindValue(":DATE",strDate.toUtf8());
//query.bindValue(":PARENT",QString("NULL").toUtf8());
query.bindValue(":URLREPLY",m_strUrl.toUtf8());
query.bindValue(":ROWNUM",(nCount++));
if (query.exec()==false)
cout << "error : " << query.lastError().text().toStdString();
}
}
if (element.attribute("class") == "reply _countableComment ")
{
strNick = Find(element,"a","class","nick pcol2").toPlainText();
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
QWebElement subElement = Find(element,"dd","class","comm pcol2");
QString subNick = Find(subElement,"a","class","nick pcol2").toPlainText();
strComm = subElement.toPlainText();
strCommUrl = Find(element,"a","class","nick pcol2").attribute("href");
if(strCommUrl.left(QString("http://blog.naver.com").length()).compare("http://blog.naver.com") == 0)
strId = strCommUrl.split("/").at(3).trimmed();
if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0)
strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed();
if(strCommUrl.left(1) == "/")
{
QStringList strList = strCommUrl.split("&");
foreach(QString str, strList)
{
if(str.left(3) == "id=")
{
strId = str.right(str.length() - 3);
}
}
}
if(subNick.isEmpty() == false)
{
//strNick = strParent;
strComm = strComm.right(strComm.size()-subNick.size()-1);
}
if (strComm.isEmpty() == false)
{
strComm = GetSafeUtf(strComm);
strComm.replace("'","\\'");
strComm.replace("\"","\\\"");
strComm = strComm.trimmed();
QSqlQuery query;
/*
cout << "m_strTable = " << m_strTable.toStdString() << endl;
cout << "url = " << strUrl.toStdString() << endl << "nickname = " << strNick.toStdString() << endl;
cout << "data = " << strComm.toStdString() << endl;
cout << "date = " << strDate.toStdString() << endl;
cout << "urlReply = " << m_strUrl.toStdString() << endl;
cout << "ronum = " << nCount << endl;
cout << "parent = " << strParent.toStdString() << endl;
*/
if(strId.length() > 0)
{
query.prepare(QString("insert into " + m_strTable + " (article_id,article_url,platform_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:ID,:URL,:PLATFORMID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
query.bindValue(":ID", strId.toUtf8());
}
else
query.prepare(QString("insert into " + m_strTable + " (article_url,platform_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order,platform_name,platform_form,article_form) VALUES (:URL,:PLATFORMID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM,'naver','blog','reply')").toUtf8());
query.bindValue(":URL",strUrl.toUtf8());
query.bindValue(":PLATFORMID",strUrl.split("/").at(3).toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strComm.toUtf8());
query.bindValue(":DATE",strDate.toUtf8());
query.bindValue(":PARENT",strParent.toUtf8());
query.bindValue(":URLREPLY",m_strUrl.toUtf8());
query.bindValue(":ROWNUM",(nCount++));
if (query.exec()==false)
{
cout << "error : " << query.lastError().text().toStdString();
}
}
}
}
}
/**
 * Parses one naver cafe search-result page.
 *
 * Steps:
 *   1. Return immediately when m_bUse is already set, or flag m_bLast when the page
 *      contains a div with id "notfound".
 *   2. For every li with class "sh_cafe_top", read the href of its "url" link and keep
 *      it when the host part is "cafe.naver.com".
 *   3. Delete rows of m_strTable whose article_url is one of the kept URLs.
 *   4. Print "o <url>" to stdout for every kept URL. m_bUse is set when at least one
 *      "sh_cafe_top" item was present.
 *   5. Read the "title_num" counter and the last query parameter of m_strUrl to decide
 *      whether this was the last page (m_bLast). A missing or unparsable counter sets
 *      m_bError.
 *
 * Changes compared with the previous version: a result item without a usable link no
 * longer indexes past the end of the split URL, the DELETE statement binds the scraped
 * URLs instead of concatenating them into the SQL text, and unused locals and a no-op
 * statement were removed.
 *
 * @param frame frame holding the loaded search-result document.
 */
void SCrawler::saveFrameCafeList(QWebFrame *frame)
{
    if (m_bUse == true) return;

    QWebElement notFound = Find(frame->documentElement(),"div","id","notfound");
    if (notFound.isNull() == false)
    {
        m_bLast = true;
        return;
    }

    QWebElement eleMain = Find(frame->documentElement(),"div","class","cafe_article section _cafeBase");

    // Collect the cafe article URLs shown on this page.
    QStringList urlList;
    bool bAnyItem = false;
    foreach(QWebElement eleSub,eleMain.findAll("li"))
    {
        if (eleSub.attribute("class") != "sh_cafe_top")
            continue;
        bAnyItem = true;

        QString strUrl;
        foreach(QWebElement eleSubUrl,eleSub.findAll("a"))
        {
            if (eleSubUrl.attribute("class") == "url")
                strUrl = eleSubUrl.attribute("href");
        }
        // value(2) is empty when the href is missing or has no host segment.
        if (strUrl.split("/").value(2) == "cafe.naver.com")
            urlList << strUrl;
    }

    // Remove rows already stored for these URLs. The URLs come from the remote page,
    // so they are bound as parameters rather than pasted into the statement.
    if (urlList.size() > 0)
    {
        QStringList placeholders;
        for (int i = 0; i < urlList.size(); i++)
            placeholders << "?";

        QString strQuery = "delete from ";
        strQuery += m_strTable;
        strQuery += " where article_url in (";
        strQuery += placeholders.join(",");
        strQuery += ")";

        QSqlQuery sql;
        bool bDone = sql.prepare(strQuery);
        if (bDone)
        {
            foreach (const QString &strBound, urlList)
                sql.addBindValue(strBound);
            bDone = sql.exec();
        }
        if (bDone == false)
        {
            cout << "error " << sql.lastError().text().toStdString();
            cout << strQuery.toStdString();
        }
    }

    // Report the accepted URLs on stdout.
    foreach (const QString &strAccepted, urlList)
        cout << "o " << strAccepted.toStdString() << endl;
    if (bAnyItem)
        m_bUse = true;

    // Paging: compare the current offset with the total shown in "title_num".
    QWebElement total = Find(eleMain,"span","class","title_num");
    const QString strTotal = total.toPlainText();
    if (strTotal.isEmpty())
    {
        m_bError = true;
        return;
    }
    const QStringList totalParts = strTotal.split("/");
    if (totalParts.size() < 2)
    {
        // Counter present but not in "<range> / <total>" form: treated like a missing one.
        m_bError = true;
        return;
    }
    const int nTotal = GetNumber(totalParts.at(1));

    const QStringList params = m_strUrl.split("&");
    const int nNow = GetNumber(params.at(params.size() - 1).split("=").value(1));
    if ((nNow + 10) > nTotal || nNow >= 1000)
        m_bLast = true;
}
/**
 * Extracts a naver cafe article and its comments from a frame tree.
 *
 * The function visits the given frame and then recurses into all child frames; once a
 * frame named "cafe_main" has been processed m_bUse is set and further calls return
 * immediately.
 *
 * For every visited frame, a non-empty h1.d-none text is stored as the platform title.
 * For the "cafe_main" frame:
 *   - body text, date, title, nickname, author id and hit count are read and stored in
 *     bodydata together with the fixed fields (table, "naver"/"cafe"/"body", platform
 *     id, keyword id);
 *   - every li of ul#cmt_list is inserted into m_strTable as a reply row. Items without
 *     a class are top-level comments; items with class "reply" get article_parent set
 *     to the "re-p-nick" text when present, otherwise to the nickname of the most
 *     recent top-level comment. reply_url is bound to m_strReper.
 * Database errors are printed to stdout as "error : <text>".
 *
 * Change compared with the previous version: the platform id (fourth "/"-separated
 * part of m_strUrl) is read once with a bounds-checked accessor. The old at(3) calls
 * indexed out of range whenever the URL had fewer than four parts, e.g. when it was
 * given without a scheme.
 *
 * @param frame frame to inspect (children are visited recursively).
 */
void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
{
    if (m_bUse) return;

    QWebElement other = Find(frame->documentElement(),"h1","class","d-none");
    if (other.toPlainText().isEmpty() == false)
    {
        bodydata.setData(SqlString(GetSafeUtf(other.toPlainText())), bodydata.PLATFORM_TITLE);
    }

    if (frame->frameName() == "cafe_main")
    {
        // Empty when m_strUrl has fewer than four "/"-separated parts.
        const QString strPlatformId = m_strUrl.split("/").value(3);

        // ---- Article body ----
        {
            QString strData,strDate,strNick,strID,strHits,strTitle;
            {
                QWebElement group = Find(frame->documentElement(),"div","class","tbody m-tcol-c");
                strData = SqlString(group.toPlainText().trimmed());
                strData = GetSafeUtf(strData);
            }
            {
                // Two layouts: a date-time cell (seconds are appended) or a date-only
                // element (midnight is appended).
                QWebElement group = Find(frame->documentElement(),"td","class","m-tcol-c date");
                strDate = group.toPlainText().trimmed().replace(".","-");
                if (strDate.isEmpty() == true)
                {
                    QWebElement subgroup = Find(frame->documentElement(),"em","class","date m-tcol-c");
                    strDate = subgroup.toPlainText().trimmed().replace(".","-");
                    strDate += " 00:00:00";
                }
                else
                    strDate += ":00";
            }
            {
                QWebElement group = Find(frame->documentElement(),"span","class","b m-tcol-c");
                strTitle = SqlString(group.toPlainText().trimmed());
            }
            {
                // Nickname from the "p-nick" cell (text before the first "("); the id is
                // the second comma-separated argument of the author link's onclick.
                // Without a "p-nick" text both come from the onclick arguments.
                QWebElement group = Find(Find(frame->documentElement(),"div","class","etc-box"),"td","class","p-nick");
                strNick = group.toPlainText().trimmed();
                if (strNick.isEmpty() == false)
                {
                    QStringList list = strNick.split("(");
                    if (list.isEmpty() == false)
                        strNick = list.at(0);
                    QWebElement id = Find(frame->documentElement(),"td","class","m-tcol-c b nick");
                    list = Find(id,"a","class","m-tcol-c b").attribute("onclick").trimmed().split(",");
                    if (list.size() >= 2)
                        strID = list.at(1).trimmed().replace("'","");
                }
                else
                {
                    QStringList list = Find(Find(frame->documentElement(),"td","class","m-tcol-c b nick"),"a","class","m-tcol-c b").attribute("onclick").trimmed().split(",");
                    if (list.size() >= 4)
                    {
                        strID = list.at(1).trimmed().replace("'","");
                        strNick = list.at(3).trimmed().replace("'","");
                    }
                }
            }
            strHits = Find(frame->documentElement(),"span","class","b m-tcol-c reply _rosReadcount").toPlainText();
            if (strHits.isEmpty())
            {
                strHits = Find(frame->documentElement(),"span","class","kin_count m-tcol-c _rosReadcount").toPlainText();
            }

            bodydata.setTable(m_strTable);
            bodydata.setData(strData, bodydata.ARTICLE_DATA);
            bodydata.setData(strDate, bodydata.ARTICLE_DATE);
            bodydata.setData(strNick, bodydata.ARTICLE_NICKNAME);
            bodydata.setData(strID, bodydata.ARTICLE_ID);
            bodydata.setData(strHits, bodydata.ARTICLE_HIT);
            bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
            bodydata.setData("naver", bodydata.PLATFORM_NAME);
            bodydata.setData("cafe", bodydata.PLATFORM_FORM);
            bodydata.setData("body", bodydata.ARTICLE_FORM);
            bodydata.setData(strPlatformId, bodydata.PLATFORM_ID);
            bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID);
            bodydata.setData(strTitle, bodydata.ARTICLE_TITLE);
        }

        // ---- Comments ----
        {
            QWebElement group = Find(frame->documentElement(),"ul","id","cmt_list");
            QWebElementCollection elements = group.findAll("li");
            QString strParent;
            int nCount = 0;
            foreach (QWebElement element, elements)
            {
                if (element.attribute("class").isEmpty())
                {
                    // Top-level comment; skipped when it has no text or no date.
                    QString strData = SqlString(Find(element,"span","class","comm_body").toPlainText().trimmed());
                    if (strData.isEmpty()) continue;
                    strData = GetSafeUtf(strData);
                    QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
                    QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed();
                    strParent = strNick;
                    QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed();
                    if (strDate.isEmpty()) continue;

                    QSqlQuery query;
                    query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,platform_id,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:PLATFORMID,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8());
                    query.bindValue(":URL",m_strUrl.toUtf8());
                    query.bindValue(":PLATFORMID",strPlatformId.toUtf8());
                    query.bindValue(":ID",strID.toUtf8());
                    query.bindValue(":NICK",strNick.toUtf8());
                    query.bindValue(":DATA",strData.toUtf8());
                    query.bindValue(":DATE",strDate.toUtf8());
                    query.bindValue(":URLREPLY",m_strReper.toUtf8());
                    query.bindValue(":ROWNUM",nCount++);
                    if (query.exec()==false)
                        cout << "error : " << query.lastError().text().toStdString();
                }
                if (element.attribute("class") == "reply")
                {
                    // Reply to a comment; skipped only when it has no text.
                    QString strData = Find(element,"span","class","comm_body").toPlainText().trimmed();
                    if (strData.isEmpty()) continue;
                    QString strReParent = strParent;
                    QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
                    QString strNick = Find(element,"td","class","p-nick").toPlainText().trimmed();
                    QString strDate = Find(element,"span","class","date m-tcol-c filter-50").toPlainText().trimmed();
                    QWebElement eleParent = Find(element,"span","class","re-p-nick");
                    if (eleParent.toPlainText().isEmpty() == false)
                        strReParent = eleParent.toPlainText();

                    QSqlQuery query;
                    query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,platform_id,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('naver','cafe','reply',:URL,:PLATFORMID,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8());
                    query.bindValue(":URL",m_strUrl.toUtf8());
                    query.bindValue(":PLATFORMID",strPlatformId.toUtf8());
                    query.bindValue(":ID",strID.toUtf8());
                    query.bindValue(":NICK",strNick.toUtf8());
                    query.bindValue(":DATA",strData.toUtf8());
                    query.bindValue(":DATE",strDate.toUtf8());
                    query.bindValue(":PARENT",strReParent.toUtf8());
                    query.bindValue(":URLREPLY",m_strReper.toUtf8());
                    query.bindValue(":ROWNUM",nCount++);
                    if (query.exec()==false)
                        cout << "error : " << query.lastError().text().toStdString();
                }
            }
        }
        m_bUse = true;
    }

    foreach(QWebFrame *childFrame, frame->childFrames())
        saveFrameCafeUrl(childFrame);
}
void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
{
if (m_bUse == true) return;
static int cz = 0;
// Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml());
//int nLast = 0;
QStringList urlList;
QWebElement eleMain = Find(frame->documentElement(),"div","class","type_fulltext wid_f");
foreach(QWebElement eleSub,eleMain.findAll("div"))
{
if (eleSub.attribute("class") == "wrap_cont")
{
//nLast++;
QString strUrl;
foreach(QWebElement eleSubUrl,eleSub.findAll("a"))
{
if (eleSubUrl.attribute("class") == "f_url")
strUrl = eleSubUrl.attribute("href");
}
if (strUrl.split("/").at(2) == "cafe.daum.net")
{
urlList << strUrl;
}
}
}
if(urlList.size() > 0)
{
QString strUrlList;
strUrlList = "(";
foreach(QString str, urlList)
{
strUrlList += "'";
strUrlList += str;
strUrlList += "',";
}
strUrlList = strUrlList.left(strUrlList.size() - 1);
strUrlList += ")";
QSqlQuery sql;
QString strQuery = "delete from ";
strQuery += m_strTable;
strQuery += QString(" where article_url in %1").arg(strUrlList);
// qDebug() << strQuery;
if (sql.exec(strQuery.toUtf8()) == false)
{
cout << "error " << sql.lastError().text().toStdString();
cout << strQuery.toStdString();
}
}
foreach(QWebElement eleSub,eleMain.findAll("div"))
{
if (eleSub.attribute("class") == "wrap_cont")
{
//nLast++;
QString strUrl,strTitle;
foreach(QWebElement eleSubUrl,eleSub.findAll("a"))
{
if (eleSubUrl.attribute("class") == "f_url")
strUrl = eleSubUrl.attribute("href");
if (eleSubUrl.attribute("class") == "f_link_bu f_l")
strTitle = GetSafeUtf(SqlString(eleSubUrl.toPlainText().trimmed()));
}
if (strUrl.split("/").at(2) == "cafe.daum.net")
{
//QSqlQuery sql;
{
/*
QString strQuery = QString("insert into ");
strQuery += m_strTable;
strQuery += QString(" set platform_name='daum',platform_form='cafe',article_form='body',article_url='%1',platform_id='%2',article_title='%3',keyword_id='%4'").arg(strUrl).arg(strUrl.split("/").at(3)).arg(strTitle).arg(m_strKeywordID);
QString strUtf8(strQuery.toUtf8());
if (sql.exec(strUtf8) == false)
cout << "x " << sql.lastError().text().toStdString();
else
*/
cout << "o " << strUrl.toStdString() << endl;
}
//else
// cout << "v " << strUrl.toStdString() << endl;
}
m_bUse = true;
}
}
{
/*
QWebElement noResult = Find(frame->documentElement(),"div","id","noResult");
if(!noResult.isNull())
m_bLast = true;
*/
if(eleMain.isNull())
m_bLast = true;
}
{
QWebElement noResult = Find(frame->documentElement(),"div","id","noResult");
if(!noResult.isNull())
{
m_bLast = true;
return;
}
}
{
bool b_last = false;
b_last = Find(frame->documentElement(), "div", "class", "result_message mg_cont hide").isNull();
b_last = b_last | !(Find(frame->documentElement(), "div", "class", "result_message mg_cont").isNull());
QWebElement total = Find(eleMain,"span","class","f_nb f_l");
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
QString strTotal = total.toPlainText().split("/").at(1);
strTotal = strTotal.replace(",","");
QRegExp rx("(\\d+)");
int pos = 0;
QList<QString> list;
while ((pos = rx.indexIn(strTotal, pos)) != -1)
{
list << rx.cap(1);
pos += rx.matchedLength();
}
int nTotal = list.at(0).toInt();
QStringList strList = total.toPlainText().split("/").at(0).trimmed().split("-");
int nNow = GetNumber(strList.at(strList.size() - 1));
int nNowFirst = GetNumber(strList.at(strList.size() - 2));
if (nNow >= 1000 || nNow >= nTotal || (nNow - nNowFirst) < 9 || b_last)
m_bLast = true;
//cout << "nNow : " << nNow << endl << "nNow - nNowFirst: " << (nNow - nNowFirst) << endl << "b_last : " << b_last << endl;
}
}
void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
{
if (m_bUse) return;
QWebElement other = frame->documentElement().findFirst("title");
QString strTitle = other.toPlainText().trimmed().split("|").at(0).trimmed();
if (strTitle.isEmpty() == false)
{
bodydata.setTable(m_strTable);
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
bodydata.setData(SqlString(GetSafeUtf(strTitle)), bodydata.PLATFORM_TITLE);
/*
QString strQuery = "update ";
strQuery += m_strTable;
strQuery += " set ";
strQuery += "platform_title = '" + SqlString(GetSafeUtf(strTitle)) + "'";
strQuery += "where article_url='";
strQuery += m_strUrl;
strQuery += "'";
QString strUtf8(strQuery.toUtf8());
QSqlQuery sql;
if (sql.exec(strUtf8) == false)
cout << "error : " << sql.lastError().text().toStdString();
*/
}
if (frame->frameName() == "down")
{
QString strHits;
{
//QString strData,strDate,strNick,strID,strHits;
QString strData,strDate,strNick,strID,strTitle;
{
QWebElement group = Find(frame->documentElement(),"div","class","bbs_contents");
strData = SqlString(group.toPlainText().trimmed());
strData = GetSafeUtf(strData);
}
{
QWebElement group = Find(frame->documentElement(),"span","class","p11 ls0");
strDate = group.toPlainText().trimmed().replace(".","-");
strDate = strDate.replace("- "," ");
if (strDate.isEmpty() == true)
{
//QWebElement subgroup = Find(frame->documentElement(),"input","name","PLAIN_REGDT").attribute("value");
//strDate = subgroup.toPlainText().trimmed();
strDate = Find(frame->documentElement(),"input","name","PLAIN_REGDT").attribute("value");
//strDate += " 00:00:00";
}
else
strDate += ":00";
}
{
QWebElement group = Find(frame->documentElement(),"div","class","subject");
QWebElement group2 = Find(group,"span","class","b");
strTitle = SqlString(group2.toPlainText().trimmed());
}
{
QWebElement group = Find(Find(frame->documentElement(),"div","class","article_writer"),"a","href","#");
strNick = group.toPlainText().trimmed();
QWebElement id = Find(frame->documentElement(),"div","class","article_writer");
QStringList list = FindLeft(id,"a","onclick","showSideView").attribute("onclick").trimmed().split(",");
if (list.size() >= 2)
strID = list.at(1).trimmed().replace("'","");
}
QStringList strList = Find(frame->documentElement(),"div","class","article_writer").toPlainText().split("|");
foreach(QString str,strList)
{
QStringList substrList = str.split(" ");
for(int i = 0;i < substrList.size();i++)
{
if((substrList.at(i).trimmed().compare("조회") == 0) || (substrList.at(i).trimmed().compare("\"조회\"") == 0))
{
strHits = substrList.at(i+1).trimmed();
break;
}
}
}
/*
if (strHits.isEmpty())
{
strHits = Find(frame->documentElement(),"span","class","kin_count m-tcol-c _rosReadcount").toPlainText();
}
*/
{
bodydata.setTable(m_strTable);
bodydata.setData(strData, bodydata.ARTICLE_DATA);
bodydata.setData(strDate, bodydata.ARTICLE_DATE);
bodydata.setData(strNick, bodydata.ARTICLE_NICKNAME);
if(!strID.isEmpty())
bodydata.setData(strID, bodydata.ARTICLE_ID);
bodydata.setData(strHits, bodydata.ARTICLE_HIT);
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
bodydata.setData("daum", bodydata.PLATFORM_NAME);
bodydata.setData("cafe", bodydata.PLATFORM_FORM);
bodydata.setData("body", bodydata.ARTICLE_FORM);
bodydata.setData(m_strUrl.split("/").at(3), bodydata.PLATFORM_ID);
bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID);
bodydata.setData(strTitle, bodydata.ARTICLE_TITLE);
/*
QSqlQuery sql;
QString strQuery = "update ";
strQuery += m_strTable;
strQuery += " set ";
strQuery += "article_data = '" + strData + "',";
strQuery += "article_date = '" + strDate + "',";
strQuery += "article_nickname = '" + strNick + "',";
if(!strID.isEmpty())
strQuery += "article_id = '" + strID + "',";
strQuery += "article_hit = '" + strHits + "'";
strQuery += "where article_url='";
strQuery += m_strUrl;
strQuery += "'";
QString strUtf8(strQuery.toUtf8());
if (sql.exec(strUtf8) == false)
cout << "error : " << sql.lastError().text().toStdString();
*/
}
}
// Comment
{
QWebElement group = Find(frame->documentElement(),"div","class","commentDiv bg_sub");
QList<QWebElement> elements = FindAllMid(group,"div","id","_cmt-",0,5);
/*
* foreach(QWebElement element, elements)
cout << "element = " << element.toPlainText().toStdString() << endl;
*/
QString commHidden = "comment_hidden";
QString commPos = "comment_pos";
QString commReComm = "recomment_pos";
QString strParent;
int nCount = 0;
foreach (QWebElement element, elements)
{
if (element.attribute("class").trimmed().right(commHidden.length()).compare(commHidden,Qt::CaseInsensitive) != 0){
if (element.attribute("class").trimmed().left(commPos.length()).compare(commPos,Qt::CaseInsensitive) == 0)
{
QString strData = SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed());
if (strData.isEmpty()) continue;
strData = GetSafeUtf(strData);
QString strID;
QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(",");
if(strListID.length() > 2)
strID = strListID.at(1).trimmed().replace("'","");
QString strNick = Find(element,"a","class","b").toPlainText().trimmed();
strParent = strNick;
QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed();
QString strDate;
if(strDatetest.count(".") == 0)
{
strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd");
strDate += (" " + strDatetest);
}
else
{
strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
}
if (strDate.isEmpty()) continue;
else strDate += ":00";
QSqlQuery query;
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
query.bindValue(":URL",m_strUrl.toUtf8());
query.bindValue(":ID",strID.toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strData.toUtf8());
query.bindValue(":DATE",strDate.toUtf8());
//query.bindValue(":URLREPLY",m_strReper.toUtf8());
query.bindValue(":ROWNUM",nCount++);
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
query.bindValue(":HITS",strHits.toUtf8());
query.bindValue(":TITLE",strTitle.toUtf8());
if (query.exec()==false)
cout << "error : " << query.lastError().text().toStdString();
}
if (element.attribute("class").trimmed().left(commReComm.length()).compare(commReComm,Qt::CaseInsensitive) == 0)
{
QString strData = SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed());
if (strData.isEmpty()) continue;
QString strReParent = Find(element,"span","class","mention_nicknames text_counter txt_point b").toPlainText().trimmed();
if(strReParent.length() == 0)
strReParent = strParent;
QString strID;
QStringList strListID = Find(element,"a","class","b").attribute("onclick").split(",");
if(strListID.length() > 2)
strID = strListID.at(1).trimmed().replace("'","");
QString strNick = Find(element,"a","class","b").toPlainText().trimmed();
QString strDatetest = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed();
QString strDate;
if(strDatetest.count(".") == 0)
{
strDate = QDateTime::currentDateTime().toString("yyyy-MM-dd");
strDate += (" " + strDatetest);
}
else
{
strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
}
if (strDate.isEmpty()) continue;
else strDate += ":00";
QSqlQuery query;
query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
query.bindValue(":URL",m_strUrl.toUtf8());
query.bindValue(":ID",strID.toUtf8());
query.bindValue(":NICK",strNick.toUtf8());
query.bindValue(":DATA",strData.toUtf8());
query.bindValue(":DATE",strDate.toUtf8());
query.bindValue(":PARENT",strReParent.toUtf8());
//query.bindValue(":URLREPLY",m_strReper.toUtf8());
query.bindValue(":ROWNUM",nCount++);
query.bindValue(":PLATFORMID",m_strUrl.split("/").at(3).toUtf8());
query.bindValue(":HITS",strHits.toUtf8());
query.bindValue(":TITLE",strTitle.toUtf8());
//QWebView::page()->mainFrame()->evaluateJavaScript("");
if (query.exec()==false)
cout << "error : " << query.lastError().text().toStdString();
}
}
}
}
m_bUse = true;
}
foreach(QWebFrame *childFrame, frame->childFrames())
saveFrameDaumCafeUrl(childFrame);
}
void SCrawler::saveFrameDaumBlogUrl(QWebFrame *frame){}
void SCrawler::saveFrameDaumBlogComment(QWebFrame *frame){}
void SCrawler::saveFrameDaumBlogList(QWebFrame *frame){}
/**
 * Returns the first descendant of `_FindElement` matching the selector
 * `_strElement` whose attribute `_strAttrib` equals `_strFind` exactly.
 *
 * @return the matching element, or a default-constructed (null) QWebElement
 *         when there is no match.
 */
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    for (int idx = 0; idx < candidates.count(); ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        if (candidate.attribute(_strAttrib) == _strFind)
            return candidate;
    }
    return QWebElement();
}
/**
 * Returns the first descendant of `_FindElement` matching the selector
 * `_strElement` for which the substring [_strStart, _strStart + _strLength)
 * of the trimmed attribute `_strAttrib` equals `_strFind`, ignoring case.
 *
 * @return the matching element, or a null QWebElement when none matches.
 */
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    for (int idx = 0; idx < candidates.count(); ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString slice = candidate.attribute(_strAttrib).trimmed().mid(_strStart,_strLength);
        if (QString::compare(slice,_strFind,Qt::CaseInsensitive) != 0)
            continue;
        return candidate;
    }
    return QWebElement();
}
/**
 * Convenience overload of FindMid: the compared substring starts at
 * `_strStart` and is as long as `_strFind` itself.
 */
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart, _strFind.length());
}
/**
 * Returns the first descendant of `_FindElement` matching the selector
 * `_strElement` whose trimmed attribute `_strAttrib` ends with `_strFind`,
 * ignoring case.
 *
 * The per-element "FindRight : ..." trace lines were removed: they were
 * written to stdout, which this process also uses for its result lines, and
 * none of the sibling finders (Find, FindMid, FindAllMid) print anything.
 *
 * @return the matching element, or a null QWebElement when none matches.
 */
QWebElement SCrawler::FindRight(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    QWebElementCollection elements = _FindElement.findAll(_strElement);
    foreach (QWebElement element, elements)
    {
        // Compare only the trailing _strFind.length() characters.
        const QString strTail = element.attribute(_strAttrib).trimmed().right(_strFind.length());
        if (QString::compare(strTail,_strFind,Qt::CaseInsensitive)==0)
            return element;
    }
    return QWebElement();
}
/**
 * Returns the first descendant of `_FindElement` matching the selector
 * `_strElement` whose trimmed attribute `_strAttrib` starts with `_strFind`
 * (case-insensitive). Implemented as FindMid with a start offset of 0.
 */
QWebElement SCrawler::FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const int prefixOffset = 0;
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, prefixOffset);
}
/**
 * Collects, in document order, every descendant of `_FindElement` matching the
 * selector `_strElement` for which the substring
 * [_strStart, _strStart + _strLength) of the trimmed attribute `_strAttrib`
 * equals `_strFind`, ignoring case.
 *
 * @return the matching elements; empty when nothing matches.
 */
QList<QWebElement> SCrawler::FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    QList<QWebElement> matches;
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    for (int idx = 0; idx < candidates.count(); ++idx)
    {
        const QWebElement candidate = candidates.at(idx);
        const QString slice = candidate.attribute(_strAttrib).trimmed().mid(_strStart,_strLength);
        if (QString::compare(slice,_strFind,Qt::CaseInsensitive) == 0)
            matches.append(candidate);
    }
    return matches;
}
/**
 * Reads up to 300 rows from the Proxy table and appends them to `_str`, one
 * "proxy,port\n" line per row. `_str` is appended to, not cleared.
 *
 * @param _str receives the appended lines.
 * @return false when the query fails to execute, true otherwise (including
 *         when the table yields no rows).
 */
bool SCrawler::getProxyList(QString &_str)
{
    QSqlQuery proxyQuery;
    const QString strSql = QString("select proxy, port from Proxy limit 300");
    if (!proxyQuery.exec(strSql))
        return false;

    for (; proxyQuery.next(); )
    {
        QString strLine = proxyQuery.value(0).toString();
        strLine += ",";
        strLine += proxyQuery.value(1).toString();
        strLine += "\n";
        _str += strLine;
    }
    return true;
}
void SCrawler::setProxy()
{
QString proxyList;
if (getProxyList(proxyList))
{
QVector <QStringList> vecProxy;
QStringList strListProxy = proxyList.split("\n");
foreach(QString str, strListProxy)
{
str = str.trimmed();
if (str.isEmpty()) continue;
vecProxy.push_back(str.split(","));
}
if (vecProxy.size() > 0)
{
QStringList strList = vecProxy.at(rand()%vecProxy.size());
switch(strList.size())
{
case 1:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break;
case 2:
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
m_strProxyIP = strList.at(0);
m_nProxyPort = strList.at(1).toInt();
//QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,"196.201.216.172",8088)));
break;
}
}
else
{
QFile file("proxy.txt");
if (file.open(QIODevice::ReadOnly | QIODevice::Text))
{
QVector <QStringList> vecProxy;
while (!file.atEnd())
{
QString str = QString(file.readLine());
if (str.isEmpty()) continue;
vecProxy.push_back(str.split(","));
}
if (vecProxy.size() > 0)
{
QStringList strList = vecProxy.at(rand()%vecProxy.size());
switch(strList.size())
{
case 1:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break;
case 2:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
break;
}
}
}
}
}
else
{
QFile file("proxy.txt");
if (file.open(QIODevice::ReadOnly | QIODevice::Text))
{
QVector <QStringList> vecProxy;
while (!file.atEnd())
{
QString str = QString(file.readLine());
if (str.isEmpty()) continue;
vecProxy.push_back(str.split(","));
}
if (vecProxy.size() > 0)
{
QStringList strList = vecProxy.at(rand()%vecProxy.size());
switch(strList.size())
{
case 1:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break;
case 2:
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
break;
}
}
}
}
}
void SCrawler::deleteProxy()
{
if (m_strProxyIP.isEmpty()) return;
QSqlQuery sqlquery;
QString strquery = "delete from Proxy where proxy = '" + m_strProxyIP + "' and port = " + QString::number(m_nProxyPort);
if(sqlquery.exec(strquery.toUtf8()) == false)
cout << "Error : " << strquery.toStdString() << endl;
}