다음카페크롤러 추가
네이버 블로그 크롤러 blog.me 기능 추가 git-svn-id: svn://192.168.0.12/source@47 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
@@ -121,6 +121,101 @@ void SCrawler::load(QStringList _strlistArgv)
|
|||||||
m_strTable = "data_" + _strlistArgv[3];
|
m_strTable = "data_" + _strlistArgv[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (_strlistArgv[0] == "daum")
|
||||||
|
{
|
||||||
|
if (_strlistArgv[1] == "cafe_list")
|
||||||
|
{
|
||||||
|
m_strUrl = _strlistArgv[2];
|
||||||
|
m_nSelect = E_DAUM_CAFE_LIST;
|
||||||
|
m_strKeywordID = _strlistArgv[4];
|
||||||
|
|
||||||
|
QFile file("proxy.txt");
|
||||||
|
if (file.open(QIODevice::ReadOnly | QIODevice::Text))
|
||||||
|
{
|
||||||
|
QVector <QStringList> vecProxy;
|
||||||
|
while (!file.atEnd())
|
||||||
|
{
|
||||||
|
QString str = QString(file.readLine());
|
||||||
|
if (str.isEmpty()) continue;
|
||||||
|
vecProxy.push_back(str.split(","));
|
||||||
|
}
|
||||||
|
if (vecProxy.size() > 0)
|
||||||
|
{
|
||||||
|
QStringList strList = vecProxy.at(rand()%vecProxy.size());
|
||||||
|
switch(strList.size())
|
||||||
|
{
|
||||||
|
case 1:
|
||||||
|
cout << "p : " << strList.at(0).toStdString() << endl;
|
||||||
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
cout << "p : " << strList.at(0).toStdString() << endl;
|
||||||
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (_strlistArgv[1] == "cafe_data")
|
||||||
|
{
|
||||||
|
m_strUrl = _strlistArgv[2];
|
||||||
|
m_nSelect = E_DAUM_CAFE_DATA;
|
||||||
|
m_strReper = _strlistArgv[4];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (_strlistArgv[1] == "blog_list")
|
||||||
|
{
|
||||||
|
m_strUrl = _strlistArgv[2];
|
||||||
|
m_nSelect = E_DAUM_BLOG_LIST;
|
||||||
|
m_strKeywordID = _strlistArgv[4];
|
||||||
|
//cout << "ok";
|
||||||
|
|
||||||
|
QFile file("proxy.txt");
|
||||||
|
if (file.open(QIODevice::ReadOnly | QIODevice::Text))
|
||||||
|
{
|
||||||
|
QVector <QStringList> vecProxy;
|
||||||
|
while (!file.atEnd())
|
||||||
|
{
|
||||||
|
QString str = QString(file.readLine());
|
||||||
|
if (str.isEmpty()) continue;
|
||||||
|
vecProxy.push_back(str.split(","));
|
||||||
|
}
|
||||||
|
if (vecProxy.size() > 0)
|
||||||
|
{
|
||||||
|
QStringList strList = vecProxy.at(rand()%vecProxy.size());
|
||||||
|
switch(strList.size())
|
||||||
|
{
|
||||||
|
case 1:
|
||||||
|
cout << "p : " << strList.at(0).toStdString() << endl;
|
||||||
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
cout << "p : " << strList.at(0).toStdString() << endl;
|
||||||
|
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (_strlistArgv[1] == "blog_url")
|
||||||
|
{
|
||||||
|
m_strUrl = _strlistArgv[2];
|
||||||
|
m_nSelect = E_DAUM_BLOG_BODY;
|
||||||
|
//m_strReper = _strlistArgv[4];
|
||||||
|
m_bUse = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (_strlistArgv[1] == "blog_comm")
|
||||||
|
{
|
||||||
|
m_strUrl = _strlistArgv[2];
|
||||||
|
m_nSelect = E_DAUM_BLOG_REPLY;
|
||||||
|
}
|
||||||
|
m_strTable = "data_" + _strlistArgv[3];
|
||||||
|
}
|
||||||
|
|
||||||
cout << m_strUrl.toStdString() << endl;
|
cout << m_strUrl.toStdString() << endl;
|
||||||
|
|
||||||
QUrl url = QUrl(m_strUrl);
|
QUrl url = QUrl(m_strUrl);
|
||||||
@@ -174,20 +269,30 @@ void SCrawler::saveResult(bool ok)
|
|||||||
case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break;
|
case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break;
|
||||||
case E_NAVER_BLOG_BODY:saveFrameUrl(m_page->mainFrame());break;
|
case E_NAVER_BLOG_BODY:saveFrameUrl(m_page->mainFrame());break;
|
||||||
case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break;
|
case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break;
|
||||||
|
case E_DAUM_CAFE_LIST:saveFrameDaumCafeList(m_page->mainFrame());break;
|
||||||
|
case E_DAUM_CAFE_DATA:saveFrameDaumCafeUrl(m_page->mainFrame());break;
|
||||||
|
case E_DAUM_BLOG_LIST:saveFrameDaumBlogList(m_page->mainFrame());break;
|
||||||
|
case E_DAUM_BLOG_BODY:saveFrameDaumBlogUrl(m_page->mainFrame());break;
|
||||||
|
case E_DAUM_BLOG_REPLY:saveFrameDaumBlogComment(m_page->mainFrame());break;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch(m_nSelect)
|
switch(m_nSelect)
|
||||||
{
|
{
|
||||||
case E_NAVER_CAFE_LIST:
|
case E_NAVER_CAFE_LIST:
|
||||||
case E_NAVER_BLOG_LIST:
|
case E_NAVER_BLOG_LIST:
|
||||||
|
case E_DAUM_CAFE_LIST:
|
||||||
|
case E_DAUM_BLOG_LIST:
|
||||||
if (m_bError) cout << "block";// block
|
if (m_bError) cout << "block";// block
|
||||||
if (m_bLast) cout << "last";
|
if (m_bLast) cout << "last";
|
||||||
break;
|
break;
|
||||||
case E_NAVER_BLOG_REPLY:
|
case E_NAVER_BLOG_REPLY:
|
||||||
|
case E_DAUM_BLOG_REPLY:
|
||||||
cout << "ok";
|
cout << "ok";
|
||||||
break;
|
break;
|
||||||
case E_NAVER_CAFE_DATA:
|
case E_NAVER_CAFE_DATA:
|
||||||
case E_NAVER_BLOG_BODY:
|
case E_NAVER_BLOG_BODY:
|
||||||
|
case E_DAUM_CAFE_DATA:
|
||||||
|
case E_DAUM_BLOG_BODY:
|
||||||
if (m_bUse == false)
|
if (m_bUse == false)
|
||||||
{
|
{
|
||||||
cout << "fail";
|
cout << "fail";
|
||||||
@@ -270,7 +375,19 @@ void SCrawler::saveFrameList(QWebFrame *frame)
|
|||||||
}
|
}
|
||||||
|
|
||||||
QStringList strList = strUrl.split('/');
|
QStringList strList = strUrl.split('/');
|
||||||
if (strList.at(0).compare("blog.naver.com") != 0 ) { cout << "x http://" << strUrl.toStdString() <<endl; continue; };
|
|
||||||
|
QString strBlogMe = "blog.me";
|
||||||
|
|
||||||
|
if ((strList.at(0).compare("blog.naver.com") != 0 ) && (strList.at(0).right(strBlogMe.length()).compare(strBlogMe) != 0)) { cout << "x http://" << strUrl.toStdString() <<endl; continue; };
|
||||||
|
|
||||||
|
if((strList.at(0).right(strBlogMe.length()).compare(strBlogMe) == 0))
|
||||||
|
{
|
||||||
|
QStringList strSubList = strList.at(0).split('.');
|
||||||
|
strUrl = "blog.naver.com/";
|
||||||
|
strUrl += strSubList.at(0);
|
||||||
|
strUrl += "/";
|
||||||
|
strUrl += strList.at(1);
|
||||||
|
}
|
||||||
|
|
||||||
QString strQuery = "select article_url from ";
|
QString strQuery = "select article_url from ";
|
||||||
strQuery += m_strTable;
|
strQuery += m_strTable;
|
||||||
@@ -284,10 +401,16 @@ void SCrawler::saveFrameList(QWebFrame *frame)
|
|||||||
str.replace("'","\\'");
|
str.replace("'","\\'");
|
||||||
str.replace("\"","\\\"");
|
str.replace("\"","\\\"");
|
||||||
str = str.trimmed();
|
str = str.trimmed();
|
||||||
|
QString strPlatformId;
|
||||||
|
|
||||||
|
if(strUrl.split("/").at(0).compare("blog.naver.com") == 0)
|
||||||
|
strPlatformId = strUrl.split("/").at(1);
|
||||||
|
else
|
||||||
|
strPlatformId = strUrl.split("/").at(0).split(".").at(0);
|
||||||
|
|
||||||
QString strQuery = QString("insert into ");
|
QString strQuery = QString("insert into ");
|
||||||
strQuery += m_strTable;
|
strQuery += m_strTable;
|
||||||
strQuery += QString(" set platform_name='naver',platform_form='blog',article_form='body',article_url='%1',platform_id='%2',platform_title='%3',keyword_id='%4'").arg("http://"+strUrl).arg(strUrl.split("/").at(1)).arg(str).arg(m_strKeywordID);
|
strQuery += QString(" set platform_name='naver',platform_form='blog',article_form='body',article_url='%1',platform_id='%2',platform_title='%3',keyword_id='%4'").arg("http://"+strUrl).arg(strPlatformId).arg(str).arg(m_strKeywordID);
|
||||||
QString strUtf8(strQuery.toUtf8());
|
QString strUtf8(strQuery.toUtf8());
|
||||||
if (sql.exec(strUtf8) == false)
|
if (sql.exec(strUtf8) == false)
|
||||||
cout << "error : " << sql.lastError().text().toStdString();
|
cout << "error : " << sql.lastError().text().toStdString();
|
||||||
@@ -324,10 +447,8 @@ enum E_DATA
|
|||||||
|
|
||||||
void SCrawler::saveFrameUrl(QWebFrame *frame)
|
void SCrawler::saveFrameUrl(QWebFrame *frame)
|
||||||
{
|
{
|
||||||
/*
|
|
||||||
static int cz = 0;
|
static int cz = 0;
|
||||||
Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml());
|
// Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml());
|
||||||
*/
|
|
||||||
|
|
||||||
QSqlQuery sql;
|
QSqlQuery sql;
|
||||||
if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0)
|
if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0)
|
||||||
@@ -351,15 +472,43 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
QString strHtml2 = frame->toHtml();
|
||||||
|
QString strFind2 = "blogpfthumb";
|
||||||
|
int start = strHtml2.indexOf(strFind2);
|
||||||
|
cout << "start = " << start << endl;
|
||||||
|
|
||||||
|
QString str222 = strHtml2.mid(start,30);
|
||||||
|
|
||||||
|
if(start != -1)
|
||||||
|
cout << "start String = " << str222.toStdString() << endl;
|
||||||
|
|
||||||
|
|
||||||
|
QWebElement image2 = Find(frame->documentElement(),"div","id","blog-profile");
|
||||||
|
cout << "p class image = " << image2.toInnerXml().toStdString() << endl;
|
||||||
|
image2 = Find(image2,"a","href","#");
|
||||||
|
image2 = Find(image2,"img","alt","프로필 이미지");
|
||||||
|
cout << "outer image profile = " << image2.attribute("src").toStdString() << endl;
|
||||||
|
image2 = FindMid(frame->documentElement(),"img","src","http://blogpfthumb",0,18);
|
||||||
|
cout << "outer image findmid = " << image2.attribute("src").toStdString() << endl;
|
||||||
|
*/
|
||||||
|
|
||||||
if (frame->frameName().compare(QString("mainFrame")) == 0)
|
if (frame->frameName().compare(QString("mainFrame")) == 0)
|
||||||
{
|
{
|
||||||
QString str[E_DATA_MAX];
|
QString str[E_DATA_MAX];
|
||||||
//QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"};
|
QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"};
|
||||||
QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data"};
|
QString strSympathy;
|
||||||
QWebElement proTitle = Find(frame->documentElement(),"td","id","blogTitleText");
|
QWebElement proTitle = Find(frame->documentElement(),"meta","property","og:article:author");
|
||||||
// str[E_DATA_PLATFORM_TITLE] = proTitle.toPlainText().trimmed();
|
str[E_DATA_PLATFORM_TITLE] = proTitle.attribute("content").split("|").at(1).trimmed();
|
||||||
|
if(str[E_DATA_PLATFORM_TITLE].length() > 0)
|
||||||
|
str[E_DATA_PLATFORM_TITLE] = GetSafeUtf(str[E_DATA_PLATFORM_TITLE]);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
proTitle = Find(frame->documentElement(),"span","id","blogTitleName");
|
||||||
|
str[E_DATA_PLATFORM_TITLE] = GetSafeUtf(proTitle.toPlainText().trimmed());
|
||||||
|
}
|
||||||
|
|
||||||
|
QWebElement image;
|
||||||
QWebElement profile = Find(frame->documentElement(),"div","id","blog-profile");
|
QWebElement profile = Find(frame->documentElement(),"div","id","blog-profile");
|
||||||
{
|
{
|
||||||
QWebElement nick = Find(profile,"strong","id","nickNameArea");
|
QWebElement nick = Find(profile,"strong","id","nickNameArea");
|
||||||
@@ -384,7 +533,8 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
|||||||
int end = strHtml.indexOf("'",start + strFind.length());
|
int end = strHtml.indexOf("'",start + strFind.length());
|
||||||
str[E_DATA_NICK] = strHtml.mid(start + strFind.length(),end-start-strFind.length());
|
str[E_DATA_NICK] = strHtml.mid(start + strFind.length(),end-start-strFind.length());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
str[E_DATA_NICK] = GetSafeUtf(str[E_DATA_NICK]);
|
||||||
if (m_strUrl.split("/").at(3) == str[E_DATA_NICK])
|
if (m_strUrl.split("/").at(3) == str[E_DATA_NICK])
|
||||||
{
|
{
|
||||||
str[E_DATA_ID] = str[E_DATA_NICK];
|
str[E_DATA_ID] = str[E_DATA_NICK];
|
||||||
@@ -399,16 +549,34 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (str[E_DATA_ID].isEmpty())
|
if (str[E_DATA_ID].isEmpty())
|
||||||
str[E_DATA_ID] = m_strUrl.split("/").at(3);
|
{
|
||||||
|
if((m_strUrl.split("/").at(2).compare("blog.naver.com") == 0))
|
||||||
|
str[E_DATA_ID] = m_strUrl.split("/").at(3);
|
||||||
|
else
|
||||||
|
str[E_DATA_ID] = m_strUrl.split("/").at(2).split(".").at(0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
//QWebElement image = Find(profile,"img","alt","프로필 이미지");
|
if(str[E_DATA_NICK].length() == 0)
|
||||||
|
str[E_DATA_NICK] = str[E_DATA_ID];
|
||||||
|
|
||||||
|
image = Find(profile,"img","alt","프로필 이미지");
|
||||||
|
|
||||||
|
//strSympathy = FindLeft(Find(frame->documentElement(),"p","class","postre"),"a","class","pcol2 _symList").toPlainText().split(" ").at(1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
cout << "inner image = " << image.attribute("src").toStdString() << endl;
|
||||||
|
image = FindMid(profile,"img","src","http://blogpfthumb",0,18);
|
||||||
|
cout << "inner image FindMid = " << image.attribute("src").toStdString() << endl;
|
||||||
|
cout << "str[E_DATA_ID] = " << str[E_DATA_ID].toStdString() << ", str[E_DATA_NICK] = " << str[E_DATA_NICK].toStdString() << endl;
|
||||||
|
*/
|
||||||
|
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
QWebElement post = Find(frame->documentElement(),"div","id","postListBody");
|
QWebElement post = Find(frame->documentElement(),"div","id","postListBody");
|
||||||
QWebElement post_top = Find(post,"table","class","post-top");
|
QWebElement post_top = Find(post,"table","class","post-top");
|
||||||
|
|
||||||
{
|
{
|
||||||
QWebElement title = Find(post_top,"div","class","htitle");
|
QWebElement title = Find(post_top,"span","class","pcol1 itemSubjectBoldfont");
|
||||||
if (title.toPlainText().isEmpty()==false)
|
if (title.toPlainText().isEmpty()==false)
|
||||||
{
|
{
|
||||||
str[E_DATA_TITLE] = title.toPlainText();
|
str[E_DATA_TITLE] = title.toPlainText();
|
||||||
@@ -441,22 +609,29 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
|
|||||||
}
|
}
|
||||||
|
|
||||||
QString strQuery = "update " + m_strTable + " set ";
|
QString strQuery = "update " + m_strTable + " set ";
|
||||||
for(int i = 0; i < E_DATA_MAX - 1 ; i++)
|
for(int i = 0; i < E_DATA_MAX ; i++)
|
||||||
{
|
{
|
||||||
strQuery += strHead[i];
|
strQuery += strHead[i];
|
||||||
strQuery += "='";
|
strQuery += "='";
|
||||||
//strQuery += GetSafeUtf(SqlString(str[i].trimmed()));
|
//strQuery += GetSafeUtf(SqlString(str[i].trimmed()));
|
||||||
strQuery += str[i].trimmed();
|
strQuery += str[i].trimmed();
|
||||||
strQuery += "'";
|
strQuery += "'";
|
||||||
if( i != (E_DATA_MAX - 2) )
|
if( i != (E_DATA_MAX - 1) )
|
||||||
strQuery += ",";
|
strQuery += ",";
|
||||||
}
|
}
|
||||||
|
if(image.attribute("src").trimmed().length() != 0)
|
||||||
|
{
|
||||||
|
strQuery += ", ";
|
||||||
|
strQuery += "article_profileurl='";
|
||||||
|
strQuery += image.attribute("src").trimmed();
|
||||||
|
strQuery += "'";
|
||||||
|
}
|
||||||
strQuery += " where article_url='";
|
strQuery += " where article_url='";
|
||||||
strQuery += m_strUrl;
|
strQuery += m_strUrl;
|
||||||
strQuery += "'";
|
strQuery += "'";
|
||||||
|
//cout << "strQuery = " << strQuery.toStdString() << endl;
|
||||||
QString strUtf8(strQuery.toUtf8());
|
QString strUtf8(strQuery.toUtf8());
|
||||||
//qDebug() << strQuery;
|
|
||||||
//cout << "Query : " << strQuery.toStdString() << endl;
|
|
||||||
if (sql.exec(strUtf8)==false)
|
if (sql.exec(strUtf8)==false)
|
||||||
{
|
{
|
||||||
cout << "error : " << sql.lastError().text().toStdString();
|
cout << "error : " << sql.lastError().text().toStdString();
|
||||||
@@ -475,16 +650,18 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
|
|||||||
QWebElementCollection elements = group.findAll("li");
|
QWebElementCollection elements = group.findAll("li");
|
||||||
QString strParent,strDate,strNick,strComm,strUrl;
|
QString strParent,strDate,strNick,strComm,strUrl;
|
||||||
QStringList strList = m_strUrl.split("/");
|
QStringList strList = m_strUrl.split("/");
|
||||||
|
|
||||||
for (int i=0; i < 5; i++)
|
for (int i=0; i < 5; i++)
|
||||||
strUrl += strList.at(i) + "/";
|
strUrl += strList.at(i) + "/";
|
||||||
|
|
||||||
strUrl = strUrl.left(strUrl.size()-1);
|
strUrl = strUrl.left(strUrl.size()-1);
|
||||||
|
|
||||||
int nCount=0;
|
int nCount=0;
|
||||||
foreach (QWebElement element, elements)
|
foreach (QWebElement element, elements)
|
||||||
{
|
{
|
||||||
if (element.attribute("class") == "_countableComment ")
|
if (element.attribute("class") == "_countableComment ")
|
||||||
{
|
{
|
||||||
strNick = strParent = Find(element,"a","class","nick pcol2").toPlainText();
|
strNick = strParent = GetSafeUtf(Find(element,"a","class","nick pcol2").toPlainText());
|
||||||
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
|
strDate = Find(element,"span","class","date fil5 pcol2").toPlainText();
|
||||||
strComm = Find(element,"dd","class","comm pcol2").toPlainText();
|
strComm = Find(element,"dd","class","comm pcol2").toPlainText();
|
||||||
strComm = GetSafeUtf(strComm);
|
strComm = GetSafeUtf(strComm);
|
||||||
@@ -568,6 +745,7 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame)
|
|||||||
if (m_bUse == true) return;
|
if (m_bUse == true) return;
|
||||||
|
|
||||||
static int cz = 0;
|
static int cz = 0;
|
||||||
|
//
|
||||||
Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml());
|
Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml());
|
||||||
|
|
||||||
QWebElement eleMain = Find(frame->documentElement(),"div","class","cafe_article section _cafeBase");
|
QWebElement eleMain = Find(frame->documentElement(),"div","class","cafe_article section _cafeBase");
|
||||||
@@ -777,6 +955,276 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
|
|||||||
saveFrameCafeUrl(childFrame);
|
saveFrameCafeUrl(childFrame);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Stores the hits of one Daum cafe search-result page.
 *
 * Every "wrap_cont" div below the "type_fulltext wid_f" container is read for
 * its link ("f_url" anchor) and title ("f_link_bu f_l" anchor). Links whose
 * host segment is cafe.daum.net are inserted into m_strTable as
 * article_form='body' rows. Afterwards the "f_nb f_l" counter is parsed to
 * decide whether this was the last page (m_bLast) or whether the page could
 * not be read at all (m_bError).
 *
 * Does nothing when m_bUse is already set; sets m_bUse once a result entry
 * has been seen.
 *
 * @param frame Frame holding the search-result document.
 */
void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
{
    if (m_bUse == true) return;

    QWebElement eleMain = Find(frame->documentElement(),"div","class","type_fulltext wid_f");
    foreach(QWebElement eleSub,eleMain.findAll("div"))
    {
        if (eleSub.attribute("class") != "wrap_cont")
            continue;

        QString strUrl,strTitle;
        foreach(QWebElement eleSubUrl,eleSub.findAll("a"))
        {
            if (eleSubUrl.attribute("class") == "f_url")
                strUrl = eleSubUrl.attribute("href");

            if (eleSubUrl.attribute("class") == "f_link_bu f_l")
                strTitle = GetSafeUtf(SqlString(eleSubUrl.toPlainText().trimmed()));
        }

        // Split once. value() returns an empty string for a missing segment,
        // so an entry without a usable link is skipped instead of indexing
        // out of range.
        const QStringList urlParts = strUrl.split("/");
        if (urlParts.value(2) == "cafe.daum.net")
        {
            // The href comes from the scraped page, so escape it before it is
            // placed inside the SQL text.
            const QString strSafeUrl = SqlString(strUrl);
            const QString strSafePlatformId = SqlString(urlParts.value(3));

            // NOTE(review): there is no duplicate check before the insert;
            // presumably a unique key on article_url rejects repeats - confirm.
            QSqlQuery sql;
            QString strQuery = QString("insert into ");
            strQuery += m_strTable;
            // Multi-argument arg() substitutes in a single pass, so a
            // percent-encoded URL (e.g. "%2B") is never mistaken for a place
            // marker by a later substitution.
            strQuery += QString(" set platform_name='daum',platform_form='cafe',article_form='body',article_url='%1',platform_id='%2',article_title='%3',keyword_id='%4'").arg(strSafeUrl, strSafePlatformId, strTitle, m_strKeywordID);
            QString strUtf8(strQuery.toUtf8());
            if (sql.exec(strUtf8) == false)
                cout << "x " << sql.lastError().text().toStdString();
            else
                cout << "o " << strUrl.toStdString() << endl;
        }
        m_bUse = true;
    }

    // Paging: the counter text is expected to look like "<first>-<last> / <total>".
    const QString strTotal = Find(eleMain,"span","class","f_nb f_l").toPlainText();
    if (strTotal.isEmpty()) {m_bError = true; return;}

    const QStringList strRange = strTotal.split("/").at(0).trimmed().split("-");
    int nNow = GetNumber(strRange.at(strRange.size() - 1));
    // Without a "-" there is only one number; treat it as both ends of the
    // range, which makes the page count as the last one.
    int nNowFirst = nNow;
    if (strRange.size() >= 2)
        nNowFirst = GetNumber(strRange.at(strRange.size() - 2));

    // Stop at result 1000, or when the page holds fewer than ten entries.
    if (nNow >= 1000 || (nNow - nNowFirst) < 9)
        m_bLast = true;
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Scrapes one Daum cafe article and its comments, walking the frame tree.
 *
 * For each visited frame, the <title> text before the first '|' is written to
 * platform_title of the row whose article_url equals m_strUrl. For the frame
 * named "down", the article text, date, nickname, writer id and hit count are
 * written to that same row, every non-hidden comment / reply is inserted as an
 * article_form='reply' row, and m_bUse is set. Because of the m_bUse guard at
 * the top, frames visited after that point are ignored.
 *
 * SQL failures are reported on stdout and do not abort the scrape.
 *
 * @param frame Frame to inspect; its child frames are visited afterwards.
 */
void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
{
    if (m_bUse) return;

    // Escaped form of the article URL for use inside hand-built SQL text.
    const QString strSafeUrl = SqlString(m_strUrl);

    QWebElement other = frame->documentElement().findFirst("title");
    QString strTitle = other.toPlainText().trimmed().split("|").at(0).trimmed();

    if (strTitle.isEmpty() == false)
    {
        QString strQuery = "update ";
        strQuery += m_strTable;
        strQuery += " set ";
        strQuery += "platform_title = '" + SqlString(GetSafeUtf(strTitle)) + "'";
        strQuery += " where article_url='";
        strQuery += strSafeUrl;
        strQuery += "'";
        QString strUtf8(strQuery.toUtf8());
        QSqlQuery sql;
        if (sql.exec(strUtf8) == false)
            cout << "error : " << sql.lastError().text().toStdString();
    }

    if (frame->frameName() == "down")
    {
        QString strHits;

        // ---- Article body -------------------------------------------------
        {
            QString strData,strDate,strNick,strID;

            strData = SqlString(Find(frame->documentElement(),"div","class","bbs_contents").toPlainText().trimmed());
            strData = GetSafeUtf(strData);

            // Date: "yyyy.MM.dd. hh:mm" style text is rewritten with '-'
            // separators and ":00" appended; if the span is missing, the
            // hidden PLAIN_REGDT input value is used unchanged.
            strDate = Find(frame->documentElement(),"span","class","p11 ls0").toPlainText().trimmed().replace(".","-");
            strDate = strDate.replace("- "," ");
            if (strDate.isEmpty() == true)
                strDate = Find(frame->documentElement(),"input","name","PLAIN_REGDT").attribute("value");
            else
                strDate += ":00";

            // Nickname / writer id. The onclick attribute is split on ',';
            // presumably it is a script call whose 2nd argument is the writer
            // id and whose 4th is the nickname - confirm against a live page.
            strNick = Find(Find(frame->documentElement(),"div","class","article_writer"),"a","href","#").toPlainText().trimmed();
            const QStringList onclickArgs = Find(Find(frame->documentElement(),"td","class","m-tcol-c b nick"),"a","class","m-tcol-c b").attribute("onclick").trimmed().split(",");
            if (strNick.isEmpty() == false)
            {
                if (onclickArgs.size() >= 2)
                    strID = onclickArgs.at(1).trimmed().replace("'","");
            }
            else if (onclickArgs.size() >= 4)
            {
                strID = onclickArgs.at(1).trimmed().replace("'","");
                strNick = onclickArgs.at(3).trimmed().replace("'","");
            }

            // Hit count: the token following "조회" in the writer line.
            QStringList strList = Find(frame->documentElement(),"div","class","article_writer").toPlainText().split("|");
            foreach(QString str,strList)
            {
                QStringList substrList = str.split(" ");
                for(int i = 0;i < substrList.size();i++)
                {
                    const QString strToken = substrList.at(i).trimmed();
                    if ((strToken.compare("조회") != 0) && (strToken.compare("\"조회\"") != 0))
                        continue;

                    // The label can be the last token; only read the next one
                    // when it exists.
                    if (i + 1 < substrList.size())
                        strHits = substrList.at(i+1).trimmed();
                    break;
                }
            }

            {
                QSqlQuery sql;
                // All values come from the scraped page, so each one is
                // escaped before being placed in the statement text
                // (strData was already escaped above).
                QString strQuery = "update ";
                strQuery += m_strTable;
                strQuery += " set ";
                strQuery += "article_data = '" + strData + "',";
                strQuery += "article_date = '" + SqlString(strDate) + "',";
                strQuery += "article_nickname = '" + SqlString(strNick) + "',";
                if(!strID.isEmpty())
                    strQuery += "article_id = '" + SqlString(strID) + "',";
                strQuery += "article_hit = '" + SqlString(strHits) + "'";
                strQuery += " where article_url='";
                strQuery += strSafeUrl;
                strQuery += "'";
                QString strUtf8(strQuery.toUtf8());
                if (sql.exec(strUtf8) == false)
                    cout << "error : " << sql.lastError().text().toStdString();
            }
        }

        // ---- Comments and replies ------------------------------------------
        {
            QWebElement group = Find(frame->documentElement(),"div","class","commentDiv bg_sub");
            QList<QWebElement> elements = FindAllMid(group,"div","id","_cmt-",0,5);

            const QString commHidden = "comment_hidden";
            const QString commPos = "comment_pos";
            const QString commReComm = "recomment_pos";

            // Cafe id = path segment after the host ("http://host/<id>/...").
            // value() yields an empty string when the URL is shorter.
            const QString strPlatformId = m_strUrl.split("/").value(3);

            QString strParent;   // nickname of the most recent top-level comment
            int nCount = 0;      // running article_order of inserted rows

            foreach (QWebElement element, elements)
            {
                const QString strClass = element.attribute("class").trimmed();
                if (strClass.endsWith(commHidden,Qt::CaseInsensitive))
                    continue;

                const bool bComment = strClass.startsWith(commPos,Qt::CaseInsensitive);
                const bool bReply = strClass.startsWith(commReComm,Qt::CaseInsensitive);
                if (!bComment && !bReply)
                    continue;

                // NOTE(review): the text is escaped with SqlString and then
                // bound as a parameter; if SqlString adds backslashes they
                // may end up stored literally - confirm.
                QString strData = SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed());
                if (strData.isEmpty()) continue;
                strData = GetSafeUtf(strData);

                // A reply names its target explicitly, or falls back to the
                // last top-level comment seen.
                QString strReParent;
                if (bReply)
                {
                    strReParent = Find(element,"span","class","mention_nicknames text_counter txt_point b").toPlainText().trimmed();
                    if(strReParent.length() == 0)
                        strReParent = strParent;
                }

                QString strID = Find(element,"input","name","writerid").attribute("value").trimmed();
                QString strNick = Find(element,"a","class","b").toPlainText().trimmed();
                if (bComment)
                    strParent = strNick;

                QString strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," ");
                if (strDate.isEmpty()) continue;
                strDate += ":00";

                QSqlQuery query;
                if (bReply)
                    query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());
                else
                    query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8());

                query.bindValue(":URL",m_strUrl.toUtf8());
                query.bindValue(":ID",strID.toUtf8());
                query.bindValue(":NICK",strNick.toUtf8());
                query.bindValue(":DATA",strData.toUtf8());
                query.bindValue(":DATE",strDate.toUtf8());
                if (bReply)
                    query.bindValue(":PARENT",strReParent.toUtf8());
                query.bindValue(":ROWNUM",nCount++);
                query.bindValue(":PLATFORMID",strPlatformId.toUtf8());
                query.bindValue(":HITS",strHits.toUtf8());
                query.bindValue(":TITLE",strTitle.toUtf8());

                if (query.exec()==false)
                    cout << "error : " << query.lastError().text().toStdString();
            }
        }
        m_bUse = true;
    }

    foreach(QWebFrame *childFrame, frame->childFrames())
        saveFrameDaumCafeUrl(childFrame);
}
|
||||||
|
|
||||||
|
// Placeholder for the Daum blog article scraper: intentionally does nothing.
void SCrawler::saveFrameDaumBlogUrl(QWebFrame *frame)
{
    Q_UNUSED(frame);
}
|
||||||
|
// Placeholder for the Daum blog comment scraper: intentionally does nothing.
void SCrawler::saveFrameDaumBlogComment(QWebFrame *frame)
{
    Q_UNUSED(frame);
}
|
||||||
|
// Placeholder for the Daum blog list scraper: intentionally does nothing.
void SCrawler::saveFrameDaumBlogList(QWebFrame *frame)
{
    Q_UNUSED(frame);
}
|
||||||
|
|
||||||
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
|
QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
|
||||||
{
|
{
|
||||||
QWebElementCollection elements = _FindElement.findAll(_strElement);
|
QWebElementCollection elements = _FindElement.findAll(_strElement);
|
||||||
@@ -790,3 +1238,65 @@ QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElem
|
|||||||
QWebElement element;
|
QWebElement element;
|
||||||
return element;
|
return element;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the first descendant of _FindElement selected by _strElement whose
// attribute _strAttrib, after trimming, has a substring starting at _strStart
// and spanning _strLength characters that equals _strFind (case-insensitive).
// Returns a default-constructed QWebElement when no descendant matches.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int nCandidates = candidates.count();

    for (int i = 0; i < nCandidates; ++i)
    {
        const QWebElement candidate = candidates.at(i);
        const QString strAttrValue = candidate.attribute(_strAttrib).trimmed();
        const QString strSlice = strAttrValue.mid(_strStart, _strLength);

        if (strSlice.compare(_strFind, Qt::CaseInsensitive) == 0)
        {
            return candidate;
        }
    }

    // No match: hand back a default-constructed element.
    return QWebElement();
}
|
||||||
|
|
||||||
|
// Convenience overload: compares exactly _strFind.length() characters of the
// attribute value, starting at _strStart, by delegating to the six-argument
// FindMid.
QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart, _strFind.length());
}
|
||||||
|
|
||||||
|
// Returns the first descendant of _FindElement selected by _strElement whose
// trimmed attribute _strAttrib ends with _strFind (case-insensitive). Only the
// last _strFind.length() characters of the attribute value are compared.
// For every inspected element the compared suffix and the search text are
// written to standard output.
// Returns a default-constructed QWebElement when no descendant matches.
QWebElement SCrawler::FindRight(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int nCandidates = candidates.count();

    for (int i = 0; i < nCandidates; ++i)
    {
        const QWebElement candidate = candidates.at(i);
        const QString strTail = candidate.attribute(_strAttrib).trimmed().right(_strFind.length());

        // Trace output emitted once per inspected element.
        cout << "FindRight : " << strTail.toStdString() << endl;
        cout << "FindRight right : " << _strFind.toStdString() << endl;

        if (strTail.compare(_strFind, Qt::CaseInsensitive) == 0)
        {
            return candidate;
        }
    }

    // No match: hand back a default-constructed element.
    return QWebElement();
}
|
||||||
|
|
||||||
|
// Prefix match: finds the first descendant whose trimmed attribute value
// starts with _strFind, by delegating to the five-argument FindMid with a
// start offset of 0.
QWebElement SCrawler::FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind)
{
    return FindMid(_FindElement, _strElement, _strAttrib, _strFind, 0);
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Collects every descendant of _FindElement selected by _strElement whose
// attribute _strAttrib, after trimming, has a substring starting at _strStart
// and spanning _strLength characters that equals _strFind (case-insensitive).
// Matches are returned in the order the collection yields them; the list is
// empty when nothing matches.
QList<QWebElement> SCrawler::FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength)
{
    QList<QWebElement> matches;

    const QWebElementCollection candidates = _FindElement.findAll(_strElement);
    const int nCandidates = candidates.count();

    for (int i = 0; i < nCandidates; ++i)
    {
        const QWebElement candidate = candidates.at(i);
        const QString strSlice = candidate.attribute(_strAttrib).trimmed().mid(_strStart, _strLength);

        if (strSlice.compare(_strFind, Qt::CaseInsensitive) == 0)
        {
            matches.append(candidate);
        }
    }

    return matches;
}
|
||||||
|
|||||||
@@ -14,6 +14,11 @@ public:
|
|||||||
E_NAVER_BLOG_LIST,
|
E_NAVER_BLOG_LIST,
|
||||||
E_NAVER_BLOG_BODY,
|
E_NAVER_BLOG_BODY,
|
||||||
E_NAVER_BLOG_REPLY,
|
E_NAVER_BLOG_REPLY,
|
||||||
|
E_DAUM_CAFE_LIST,
|
||||||
|
E_DAUM_CAFE_DATA,
|
||||||
|
E_DAUM_BLOG_LIST,
|
||||||
|
E_DAUM_BLOG_BODY,
|
||||||
|
E_DAUM_BLOG_REPLY
|
||||||
};
|
};
|
||||||
public:
|
public:
|
||||||
SCrawler();
|
SCrawler();
|
||||||
@@ -46,8 +51,20 @@ private:
|
|||||||
void saveFrameUrl(QWebFrame *frame);
|
void saveFrameUrl(QWebFrame *frame);
|
||||||
void saveFrameComment(QWebFrame *frame);
|
void saveFrameComment(QWebFrame *frame);
|
||||||
void saveFrameCafeUrl(QWebFrame *frame);
|
void saveFrameCafeUrl(QWebFrame *frame);
|
||||||
|
void saveFrameDaumBlogList(QWebFrame *frame);
|
||||||
|
void saveFrameDaumCafeList(QWebFrame *frame);
|
||||||
|
void saveFrameDaumBlogUrl(QWebFrame *frame);
|
||||||
|
void saveFrameDaumBlogComment(QWebFrame *frame);
|
||||||
|
void saveFrameDaumCafeUrl(QWebFrame *frame);
|
||||||
int GetNumber(QString _str);
|
int GetNumber(QString _str);
|
||||||
|
|
||||||
|
|
||||||
QWebElement Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind);
|
QWebElement Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind);
|
||||||
|
QWebElement FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength);
|
||||||
|
QWebElement FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart);
|
||||||
|
QWebElement FindRight(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind);
|
||||||
|
QWebElement FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind);
|
||||||
|
QList<QWebElement> FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength);
|
||||||
QWebElementCollection Finds(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind);
|
QWebElementCollection Finds(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind);
|
||||||
void UpdateError(QString _strError);
|
void UpdateError(QString _strError);
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user