diff --git a/CrawlerProcess/scrawler.cpp b/CrawlerProcess/scrawler.cpp index 19fbc1a..3819875 100644 --- a/CrawlerProcess/scrawler.cpp +++ b/CrawlerProcess/scrawler.cpp @@ -121,6 +121,101 @@ void SCrawler::load(QStringList _strlistArgv) m_strTable = "data_" + _strlistArgv[3]; } + + if (_strlistArgv[0] == "daum") + { + if (_strlistArgv[1] == "cafe_list") + { + m_strUrl = _strlistArgv[2]; + m_nSelect = E_DAUM_CAFE_LIST; + m_strKeywordID = _strlistArgv[4]; + + QFile file("proxy.txt"); + if (file.open(QIODevice::ReadOnly | QIODevice::Text)) + { + QVector vecProxy; + while (!file.atEnd()) + { + QString str = QString(file.readLine()); + if (str.isEmpty()) continue; + vecProxy.push_back(str.split(",")); + } + if (vecProxy.size() > 0) + { + QStringList strList = vecProxy.at(rand()%vecProxy.size()); + switch(strList.size()) + { + case 1: + cout << "p : " << strList.at(0).toStdString() << endl; + QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); + break; + case 2: + cout << "p : " << strList.at(0).toStdString() << endl; + QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); + break; + } + } + } + } + + if (_strlistArgv[1] == "cafe_data") + { + m_strUrl = _strlistArgv[2]; + m_nSelect = E_DAUM_CAFE_DATA; + m_strReper = _strlistArgv[4]; + } + + if (_strlistArgv[1] == "blog_list") + { + m_strUrl = _strlistArgv[2]; + m_nSelect = E_DAUM_BLOG_LIST; + m_strKeywordID = _strlistArgv[4]; + //cout << "ok"; + + QFile file("proxy.txt"); + if (file.open(QIODevice::ReadOnly | QIODevice::Text)) + { + QVector vecProxy; + while (!file.atEnd()) + { + QString str = QString(file.readLine()); + if (str.isEmpty()) continue; + vecProxy.push_back(str.split(",")); + } + if (vecProxy.size() > 0) + { + QStringList strList = vecProxy.at(rand()%vecProxy.size()); + switch(strList.size()) + { + case 1: + cout << "p : " << strList.at(0).toStdString() << endl; + QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); + break; + case 2: + cout << "p : " << strList.at(0).toStdString() << endl; + QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); + break; + } + } + } + } + + if (_strlistArgv[1] == "blog_url") + { + m_strUrl = _strlistArgv[2]; + m_nSelect = E_DAUM_BLOG_BODY; + //m_strReper = _strlistArgv[4]; + m_bUse = true; + } + + if (_strlistArgv[1] == "blog_comm") + { + m_strUrl = _strlistArgv[2]; + m_nSelect = E_DAUM_BLOG_REPLY; + } + m_strTable = "data_" + _strlistArgv[3]; + } + cout << m_strUrl.toStdString() << endl; QUrl url = QUrl(m_strUrl); @@ -174,20 +269,30 @@ void SCrawler::saveResult(bool ok) case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break; case E_NAVER_BLOG_BODY:saveFrameUrl(m_page->mainFrame());break; case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break; + case E_DAUM_CAFE_LIST:saveFrameDaumCafeList(m_page->mainFrame());break; + case E_DAUM_CAFE_DATA:saveFrameDaumCafeUrl(m_page->mainFrame());break; + case E_DAUM_BLOG_LIST:saveFrameDaumBlogList(m_page->mainFrame());break; + case E_DAUM_BLOG_BODY:saveFrameDaumBlogUrl(m_page->mainFrame());break; + case E_DAUM_BLOG_REPLY:saveFrameDaumBlogComment(m_page->mainFrame());break; } switch(m_nSelect) { case E_NAVER_CAFE_LIST: case E_NAVER_BLOG_LIST: + case E_DAUM_CAFE_LIST: + case E_DAUM_BLOG_LIST: if (m_bError) cout << "block";// block if (m_bLast) cout << "last"; break; case E_NAVER_BLOG_REPLY: + case E_DAUM_BLOG_REPLY: cout << "ok"; break; case E_NAVER_CAFE_DATA: case E_NAVER_BLOG_BODY: + case E_DAUM_CAFE_DATA: + case E_DAUM_BLOG_BODY: if (m_bUse == false) { cout << "fail"; @@ -270,7 +375,19 @@ void SCrawler::saveFrameList(QWebFrame *frame) } QStringList strList = strUrl.split('/'); - if (strList.at(0).compare("blog.naver.com") != 0 ) { cout << "x http://" << strUrl.toStdString() <frameName() + QString::number(cz++) + ".html",frame->toHtml()); - */ + // Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml()); QSqlQuery sql; if (frame->frameName().compare(QString("BuddyConnectIframe")) == 0) @@ -351,15 +472,43 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) } } } +/* + QString strHtml2 = frame->toHtml(); + QString strFind2 = "blogpfthumb"; + int start = strHtml2.indexOf(strFind2); + cout << "start = " << start << endl; + + QString str222 = strHtml2.mid(start,30); + + if(start != -1) + cout << "start String = " << str222.toStdString() << endl; + + + QWebElement image2 = Find(frame->documentElement(),"div","id","blog-profile"); + cout << "p class image = " << image2.toInnerXml().toStdString() << endl; + image2 = Find(image2,"a","href","#"); + image2 = Find(image2,"img","alt","프로필 이미지"); + cout << "outer image profile = " << image2.attribute("src").toStdString() << endl; + image2 = FindMid(frame->documentElement(),"img","src","http://blogpfthumb",0,18); + cout << "outer image findmid = " << image2.attribute("src").toStdString() << endl; +*/ if (frame->frameName().compare(QString("mainFrame")) == 0) { QString str[E_DATA_MAX]; - //QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"}; - QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data"}; - QWebElement proTitle = Find(frame->documentElement(),"td","id","blogTitleText"); - // str[E_DATA_PLATFORM_TITLE] = proTitle.toPlainText().trimmed(); + QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"}; + QString strSympathy; + QWebElement proTitle = Find(frame->documentElement(),"meta","property","og:article:author"); + str[E_DATA_PLATFORM_TITLE] = proTitle.attribute("content").split("|").at(1).trimmed(); + if(str[E_DATA_PLATFORM_TITLE].length() > 0) + str[E_DATA_PLATFORM_TITLE] = GetSafeUtf(str[E_DATA_PLATFORM_TITLE]); + else + { + proTitle = Find(frame->documentElement(),"span","id","blogTitleName"); + str[E_DATA_PLATFORM_TITLE] = GetSafeUtf(proTitle.toPlainText().trimmed()); + } + QWebElement image; QWebElement profile = Find(frame->documentElement(),"div","id","blog-profile"); { QWebElement nick = Find(profile,"strong","id","nickNameArea"); @@ -384,7 +533,8 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) int end = strHtml.indexOf("'",start + strFind.length()); str[E_DATA_NICK] = strHtml.mid(start + strFind.length(),end-start-strFind.length()); } - } + } + str[E_DATA_NICK] = GetSafeUtf(str[E_DATA_NICK]); if (m_strUrl.split("/").at(3) == str[E_DATA_NICK]) { str[E_DATA_ID] = str[E_DATA_NICK]; @@ -399,16 +549,34 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) } if (str[E_DATA_ID].isEmpty()) - str[E_DATA_ID] = m_strUrl.split("/").at(3); + { + if((m_strUrl.split("/").at(2).compare("blog.naver.com") == 0)) + str[E_DATA_ID] = m_strUrl.split("/").at(3); + else + str[E_DATA_ID] = m_strUrl.split("/").at(2).split(".").at(0); + } } - //QWebElement image = Find(profile,"img","alt","프로필 이미지"); + if(str[E_DATA_NICK].length() == 0) + str[E_DATA_NICK] = str[E_DATA_ID]; + + image = Find(profile,"img","alt","프로필 이미지"); + + //strSympathy = FindLeft(Find(frame->documentElement(),"p","class","postre"),"a","class","pcol2 _symList").toPlainText().split(" ").at(1); + + /* + cout << "inner image = " << image.attribute("src").toStdString() << endl; + image = FindMid(profile,"img","src","http://blogpfthumb",0,18); + cout << "inner image FindMid = " << image.attribute("src").toStdString() << endl; + cout << "str[E_DATA_ID] = " << str[E_DATA_ID].toStdString() << ", str[E_DATA_NICK] = " << str[E_DATA_NICK].toStdString() << endl; + */ + } { QWebElement post = Find(frame->documentElement(),"div","id","postListBody"); QWebElement post_top = Find(post,"table","class","post-top"); { - QWebElement title = Find(post_top,"div","class","htitle"); + QWebElement title = Find(post_top,"span","class","pcol1 itemSubjectBoldfont"); if (title.toPlainText().isEmpty()==false) { str[E_DATA_TITLE] = title.toPlainText(); @@ -441,22 +609,29 @@ void SCrawler::saveFrameUrl(QWebFrame *frame) } QString strQuery = "update " + m_strTable + " set "; - for(int i = 0; i < E_DATA_MAX - 1 ; i++) + for(int i = 0; i < E_DATA_MAX ; i++) { strQuery += strHead[i]; strQuery += "='"; //strQuery += GetSafeUtf(SqlString(str[i].trimmed())); strQuery += str[i].trimmed(); strQuery += "'"; - if( i != (E_DATA_MAX - 2) ) + if( i != (E_DATA_MAX - 1) ) strQuery += ","; } + if(image.attribute("src").trimmed().length() != 0) + { + strQuery += ", "; + strQuery += "article_profileurl='"; + strQuery += image.attribute("src").trimmed(); + strQuery += "'"; + } strQuery += " where article_url='"; strQuery += m_strUrl; strQuery += "'"; + //cout << "strQuery = " << strQuery.toStdString() << endl; QString strUtf8(strQuery.toUtf8()); - //qDebug() << strQuery; - //cout << "Query : " << strQuery.toStdString() << endl; + if (sql.exec(strUtf8)==false) { cout << "error : " << sql.lastError().text().toStdString(); @@ -475,16 +650,18 @@ void SCrawler::saveFrameComment(QWebFrame *frame) QWebElementCollection elements = group.findAll("li"); QString strParent,strDate,strNick,strComm,strUrl; QStringList strList = m_strUrl.split("/"); + for (int i=0; i < 5; i++) strUrl += strList.at(i) + "/"; strUrl = strUrl.left(strUrl.size()-1); + int nCount=0; foreach (QWebElement element, elements) { if (element.attribute("class") == "_countableComment ") { - strNick = strParent = Find(element,"a","class","nick pcol2").toPlainText(); + strNick = strParent = GetSafeUtf(Find(element,"a","class","nick pcol2").toPlainText()); strDate = Find(element,"span","class","date fil5 pcol2").toPlainText(); strComm = Find(element,"dd","class","comm pcol2").toPlainText(); strComm = GetSafeUtf(strComm); @@ -568,6 +745,7 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame) if (m_bUse == true) return; static int cz = 0; + // Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml()); QWebElement eleMain = Find(frame->documentElement(),"div","class","cafe_article section _cafeBase"); @@ -777,6 +955,276 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame) saveFrameCafeUrl(childFrame); } + +void SCrawler::saveFrameDaumCafeList(QWebFrame *frame) +{ + if (m_bUse == true) return; + + static int cz = 0; + // Debug(frame->frameName() + QString::number(cz++) + ".html",frame->toHtml()); + + QWebElement eleMain = Find(frame->documentElement(),"div","class","type_fulltext wid_f"); + foreach(QWebElement eleSub,eleMain.findAll("div")) + { + if (eleSub.attribute("class") == "wrap_cont") + { + QString strUrl,strTitle; + foreach(QWebElement eleSubUrl,eleSub.findAll("a")) + { + if (eleSubUrl.attribute("class") == "f_url") + strUrl = eleSubUrl.attribute("href"); + + if (eleSubUrl.attribute("class") == "f_link_bu f_l") + strTitle = GetSafeUtf(SqlString(eleSubUrl.toPlainText().trimmed())); + } + + if (strUrl.split("/").at(2) == "cafe.daum.net") + { + QSqlQuery sql; + /* + QString strQuery = "select URL from "; + strQuery += m_strTableBody; + strQuery += QString(" where URL = '%1'").arg(strUrl); + sql.exec(strQuery); + if (sql.size() == 0) + */ + { + QString strQuery = QString("insert into "); + strQuery += m_strTable; + strQuery += QString(" set platform_name='daum',platform_form='cafe',article_form='body',article_url='%1',platform_id='%2',article_title='%3',keyword_id='%4'").arg(strUrl).arg(strUrl.split("/").at(3)).arg(strTitle).arg(m_strKeywordID); + QString strUtf8(strQuery.toUtf8()); + if (sql.exec(strUtf8) == false) + cout << "x " << sql.lastError().text().toStdString(); + else + cout << "o " << strUrl.toStdString() << endl; + } + } + m_bUse = true; + } + } + + { + QWebElement total = Find(eleMain,"span","class","f_nb f_l"); + if (total.toPlainText().isEmpty()) {m_bError = true; return;} + total.toPlainText().split("/").size(); + QStringList strList = total.toPlainText().split("/").at(0).trimmed().split("-"); + int nNow = GetNumber(strList.at(strList.size() - 1)); + int nNowFirst = GetNumber(strList.at(strList.size() - 2)); + if (nNow >= 1000 || (nNow - nNowFirst) < 9) + m_bLast = true; + } +} + + +void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame) +{ + if (m_bUse) return; + + + QWebElement other = frame->documentElement().findFirst("title"); + QString strTitle = other.toPlainText().trimmed().split("|").at(0).trimmed(); + + if (strTitle.isEmpty() == false) + { + QString strQuery = "update "; + strQuery += m_strTable; + strQuery += " set "; + strQuery += "platform_title = '" + SqlString(GetSafeUtf(strTitle)) + "'"; + strQuery += "where article_url='"; + strQuery += m_strUrl; + strQuery += "'"; + QString strUtf8(strQuery.toUtf8()); + QSqlQuery sql; + if (sql.exec(strUtf8) == false) + cout << "error : " << sql.lastError().text().toStdString(); + } + + if (frame->frameName() == "down") + { + QString strHits; + { + //QString strData,strDate,strNick,strID,strHits; + QString strData,strDate,strNick,strID; + { + QWebElement group = Find(frame->documentElement(),"div","class","bbs_contents"); + strData = SqlString(group.toPlainText().trimmed()); + strData = GetSafeUtf(strData); + } + { + QWebElement group = Find(frame->documentElement(),"span","class","p11 ls0"); + strDate = group.toPlainText().trimmed().replace(".","-"); + strDate = strDate.replace("- "," "); + if (strDate.isEmpty() == true) + { + //QWebElement subgroup = Find(frame->documentElement(),"input","name","PLAIN_REGDT").attribute("value"); + //strDate = subgroup.toPlainText().trimmed(); + strDate = Find(frame->documentElement(),"input","name","PLAIN_REGDT").attribute("value"); + //strDate += " 00:00:00"; + } + else + strDate += ":00"; + } + + { + QWebElement group = Find(Find(frame->documentElement(),"div","class","article_writer"),"a","href","#"); + strNick = group.toPlainText().trimmed(); + + if (strNick.isEmpty() == false) + { + /* + QStringList list = strNick.split("("); + if (list.isEmpty() == false) + strNick = list.at(0); + */ + QWebElement id = Find(frame->documentElement(),"td","class","m-tcol-c b nick"); + QStringList list = Find(id,"a","class","m-tcol-c b").attribute("onclick").trimmed().split(","); + if (list.size() >= 2) + strID = list.at(1).trimmed().replace("'",""); + } + else + { + QStringList list = Find(Find(frame->documentElement(),"td","class","m-tcol-c b nick"),"a","class","m-tcol-c b").attribute("onclick").trimmed().split(","); + if (list.size() >= 4) + { + strID = list.at(1).trimmed().replace("'",""); + strNick = list.at(3).trimmed().replace("'",""); + } + } + } + + QStringList strList = Find(frame->documentElement(),"div","class","article_writer").toPlainText().split("|"); + + foreach(QString str,strList) + { + QStringList substrList = str.split(" "); + for(int i = 0;i < substrList.size();i++) + { + if((substrList.at(i).trimmed().compare("조회") == 0) || (substrList.at(i).trimmed().compare("\"조회\"") == 0)) + { + strHits = substrList.at(i+1).trimmed(); + break; + } + } + } + /* + if (strHits.isEmpty()) + { + strHits = Find(frame->documentElement(),"span","class","kin_count m-tcol-c _rosReadcount").toPlainText(); + } + */ + { + QSqlQuery sql; + QString strQuery = "update "; + strQuery += m_strTable; + strQuery += " set "; + strQuery += "article_data = '" + strData + "',"; + strQuery += "article_date = '" + strDate + "',"; + strQuery += "article_nickname = '" + strNick + "',"; + if(!strID.isEmpty()) + strQuery += "article_id = '" + strID + "',"; + strQuery += "article_hit = '" + strHits + "'"; + strQuery += "where article_url='"; + strQuery += m_strUrl; + strQuery += "'"; + QString strUtf8(strQuery.toUtf8()); + if (sql.exec(strUtf8) == false) + cout << "error : " << sql.lastError().text().toStdString(); + } + } + // Comment + { + QWebElement group = Find(frame->documentElement(),"div","class","commentDiv bg_sub"); + QList elements = FindAllMid(group,"div","id","_cmt-",0,5); + /* + * foreach(QWebElement element, elements) + cout << "element = " << element.toPlainText().toStdString() << endl; + */ + QString commHidden = "comment_hidden"; + QString commPos = "comment_pos"; + QString commReComm = "recomment_pos"; + QString strParent; + int nCount = 0; + foreach (QWebElement element, elements) + { + + if (element.attribute("class").trimmed().right(commHidden.length()).compare(commHidden,Qt::CaseInsensitive) != 0){ + if (element.attribute("class").trimmed().left(commPos.length()).compare(commPos,Qt::CaseInsensitive) == 0) + { + QString strData = SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed()); + if (strData.isEmpty()) continue; + strData = GetSafeUtf(strData); + QString strID = Find(element,"input","name","writerid").attribute("value").trimmed(); + QString strNick = Find(element,"a","class","b").toPlainText().trimmed(); + strParent = strNick; + QString strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," "); + if (strDate.isEmpty()) continue; + else strDate += ":00"; + QSqlQuery query; + //query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,reply_url,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:URLREPLY,:ROWNUM)").toUtf8()); + //query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM)").toUtf8()); + query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8()); + query.bindValue(":URL",m_strUrl.toUtf8()); + query.bindValue(":ID",strID.toUtf8()); + query.bindValue(":NICK",strNick.toUtf8()); + query.bindValue(":DATA",strData.toUtf8()); + query.bindValue(":DATE",strDate.toUtf8()); + //query.bindValue(":URLREPLY",m_strReper.toUtf8()); + query.bindValue(":ROWNUM",nCount++); + query.bindValue(":PLATFORMID",m_strUrl.split("/").at(1).toUtf8()); + query.bindValue(":HITS",strHits.toUtf8()); + query.bindValue(":TITLE",strTitle.toUtf8()); + + if (query.exec()==false) + cout << "error : " << query.lastError().text().toStdString(); + } + if (element.attribute("class").trimmed().left(commReComm.length()).compare(commReComm,Qt::CaseInsensitive) == 0) + { + QString strData = SqlString(Find(element,"span","class","comment_contents").toPlainText().trimmed()); + if (strData.isEmpty()) continue; + + QString strReParent = Find(element,"span","class","mention_nicknames text_counter txt_point b").toPlainText().trimmed(); + if(strReParent.length() == 0) + strReParent = strParent; + + QString strID = Find(element,"input","name","writerid").attribute("value").trimmed(); + QString strNick = Find(element,"a","class","b").toPlainText().trimmed(); + QString strDate = Find(element,"span","class","comment_date txt_sub p11 ls0").toPlainText().trimmed().replace(".","-").replace("- "," "); + if (strDate.isEmpty()) continue; + else strDate += ":00"; + + QSqlQuery query; + //query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,reply_url,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:URLREPLY,:ROWNUM)").toUtf8()); + //query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM)").toUtf8()); + query.prepare(QString("insert into " + m_strTable + " (platform_name,platform_form,article_form,article_url,article_id,article_nickname,article_data,article_date,article_parent,article_order,platform_id,article_hit,platform_title) VALUES ('daum','cafe','reply',:URL,:ID,:NICK,:DATA,:DATE,:PARENT,:ROWNUM,:PLATFORMID,:HITS,:TITLE)").toUtf8()); + query.bindValue(":URL",m_strUrl.toUtf8()); + query.bindValue(":ID",strID.toUtf8()); + query.bindValue(":NICK",strNick.toUtf8()); + query.bindValue(":DATA",strData.toUtf8()); + query.bindValue(":DATE",strDate.toUtf8()); + query.bindValue(":PARENT",strReParent.toUtf8()); + //query.bindValue(":URLREPLY",m_strReper.toUtf8()); + query.bindValue(":ROWNUM",nCount++); + query.bindValue(":PLATFORMID",m_strUrl.split("/").at(1).toUtf8()); + query.bindValue(":HITS",strHits.toUtf8()); + query.bindValue(":TITLE",strTitle.toUtf8()); + //QWebView::page()->mainFrame()->evaluateJavaScript(""); + if (query.exec()==false) + cout << "error : " << query.lastError().text().toStdString(); + } + } + } + } + m_bUse = true; + } + + foreach(QWebFrame *childFrame, frame->childFrames()) + saveFrameDaumCafeUrl(childFrame); +} + +void SCrawler::saveFrameDaumBlogUrl(QWebFrame *frame){} +void SCrawler::saveFrameDaumBlogComment(QWebFrame *frame){} +void SCrawler::saveFrameDaumBlogList(QWebFrame *frame){} + QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind) { QWebElementCollection elements = _FindElement.findAll(_strElement); @@ -790,3 +1238,65 @@ QWebElement SCrawler::Find(const QWebElement _FindElement,const QString _strElem QWebElement element; return element; } + +QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength) +{ + QWebElementCollection elements = _FindElement.findAll(_strElement); + foreach (QWebElement element, elements) + { + QString str = element.attribute(_strAttrib).trimmed().mid(_strStart,_strLength); + if (QString::compare(str,_strFind,Qt::CaseInsensitive)==0) + { + return element; + } + } + QWebElement element; + return element; +} + +QWebElement SCrawler::FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart) +{ + int _strLength = _strFind.length(); + return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart, _strLength); +} + +QWebElement SCrawler::FindRight(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind) +{ + QWebElementCollection elements = _FindElement.findAll(_strElement); + foreach (QWebElement element, elements) + { + QString str = element.attribute(_strAttrib).trimmed().right(_strFind.length()); + cout << "FindRight : " << str.toStdString() << endl; + cout << "FindRight right : " << _strFind.toStdString() << endl; + if (QString::compare(str,_strFind,Qt::CaseInsensitive)==0) + { + return element; + } + } + QWebElement element; + return element; +} + +QWebElement SCrawler::FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind) +{ + int _strStart = 0; + return FindMid(_FindElement, _strElement, _strAttrib, _strFind, _strStart); +} + + + +QList SCrawler::FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength) +{ + QWebElementCollection elements = _FindElement.findAll(_strElement); + QList returnElements = QList(); + + foreach (QWebElement element, elements) + { + QString str = element.attribute(_strAttrib).trimmed().mid(_strStart,_strLength); + if (QString::compare(str,_strFind,Qt::CaseInsensitive)==0) + { + returnElements.append(element); + } + } + return returnElements; +} diff --git a/CrawlerProcess/scrawler.h b/CrawlerProcess/scrawler.h index c96f47d..191ba9d 100644 --- a/CrawlerProcess/scrawler.h +++ b/CrawlerProcess/scrawler.h @@ -14,6 +14,11 @@ public: E_NAVER_BLOG_LIST, E_NAVER_BLOG_BODY, E_NAVER_BLOG_REPLY, + E_DAUM_CAFE_LIST, + E_DAUM_CAFE_DATA, + E_DAUM_BLOG_LIST, + E_DAUM_BLOG_BODY, + E_DAUM_BLOG_REPLY }; public: SCrawler(); @@ -46,8 +51,20 @@ private: void saveFrameUrl(QWebFrame *frame); void saveFrameComment(QWebFrame *frame); void saveFrameCafeUrl(QWebFrame *frame); + void saveFrameDaumBlogList(QWebFrame *frame); + void saveFrameDaumCafeList(QWebFrame *frame); + void saveFrameDaumBlogUrl(QWebFrame *frame); + void saveFrameDaumBlogComment(QWebFrame *frame); + void saveFrameDaumCafeUrl(QWebFrame *frame); int GetNumber(QString _str); + + QWebElement Find(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind); + QWebElement FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength); + QWebElement FindMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart); + QWebElement FindRight(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind); + QWebElement FindLeft(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind); + QList FindAllMid(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind, const int _strStart, const int _strLength); QWebElementCollection Finds(const QWebElement _FindElement,const QString _strElement,const QString _strAttrib,const QString _strFind); void UpdateError(QString _strError); };