update -> insert 방식으로 고침

git-svn-id: svn://192.168.0.12/source@104 8346c931-da38-4b9b-9d4c-e48b93cbd075
This commit is contained in:
admin
2015-05-11 06:31:04 +00:00
parent 938674a1d9
commit 7f02047720
4 changed files with 362 additions and 27 deletions

View File

@@ -42,6 +42,7 @@ void SCrawler::load(QStringList _strlistArgv)
m_strUrl = _strlistArgv[2];
m_nSelect = E_NAVER_CAFE_DATA;
m_strReper = _strlistArgv[4];
m_strKeywordID = _strlistArgv[5];
}
if (_strlistArgv[1] == "blog_list")
@@ -56,6 +57,7 @@ void SCrawler::load(QStringList _strlistArgv)
{
m_strUrl = _strlistArgv[2];
m_nSelect = E_NAVER_BLOG_BODY;
m_strKeywordID = _strlistArgv[4];
m_bUse = true;
}
@@ -83,6 +85,7 @@ void SCrawler::load(QStringList _strlistArgv)
m_strUrl = _strlistArgv[2];
m_nSelect = E_DAUM_CAFE_DATA;
m_strReper = _strlistArgv[4];
m_strKeywordID = _strlistArgv[5];
}
if (_strlistArgv[1] == "blog_list")
@@ -159,14 +162,34 @@ void SCrawler::saveResult(bool ok)
switch(m_nSelect)
{
case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break;
case E_NAVER_CAFE_DATA:saveFrameCafeUrl(m_page->mainFrame());break;
case E_NAVER_CAFE_DATA:
{
saveFrameCafeUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_NAVER_BLOG_LIST:saveFrameList(m_page->mainFrame());break;
case E_NAVER_BLOG_BODY:saveFrameUrl(m_page->mainFrame());break;
case E_NAVER_BLOG_BODY:
{
saveFrameUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break;
case E_DAUM_CAFE_LIST:saveFrameDaumCafeList(m_page->mainFrame());break;
case E_DAUM_CAFE_DATA:saveFrameDaumCafeUrl(m_page->mainFrame());break;
case E_DAUM_CAFE_DATA:
{
saveFrameDaumCafeUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_DAUM_BLOG_LIST:saveFrameDaumBlogList(m_page->mainFrame());break;
case E_DAUM_BLOG_BODY:saveFrameDaumBlogUrl(m_page->mainFrame());break;
case E_DAUM_BLOG_BODY:
{
saveFrameDaumBlogUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_DAUM_BLOG_REPLY:saveFrameDaumBlogComment(m_page->mainFrame());break;
}
@@ -263,7 +286,10 @@ void SCrawler::saveFrameList(QWebFrame *frame)
QWebElement notFound = Find(frame->documentElement(),"div","id","notfound");
if(notFound.isNull() == false)
{
m_bLast = true;
return;
}
QWebElement eleMain = Find(frame->documentElement(),"div","class","blog section _blogBase");
QSqlQuery sql;
@@ -383,6 +409,7 @@ void SCrawler::saveFrameList(QWebFrame *frame)
else
strPlatformId = strUrl.split("/").at(0).split(".").at(0);
/*
QString strQuery = QString("insert into ");
strQuery += m_strTable;
strQuery += QString(" set platform_name='naver',platform_form='blog',article_form='body',article_url='%1',platform_id='%2',platform_title='%3',keyword_id='%4'").arg("http://"+strUrl).arg(strPlatformId).arg(str).arg(m_strKeywordID);
@@ -390,6 +417,7 @@ void SCrawler::saveFrameList(QWebFrame *frame)
if (sql.exec(strUtf8) == false)
cout << "error : " << sql.lastError().text().toStdString();
else
*/
cout << "o ";
}
//else
@@ -399,6 +427,7 @@ void SCrawler::saveFrameList(QWebFrame *frame)
}
{
QWebElement total = Find(eleMain,"span","class","title_num");
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
@@ -432,6 +461,9 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
QString str = profile.toPlainText().split("\n").at(0);
if (str.isEmpty() == false)
{
bodydata.setData(str, bodydata.ARTICLE_NICKNAME);
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
/*
QString strQuery = "update " + m_strTable + " set article_nickname = '";
strQuery += str;
strQuery += "'";
@@ -445,6 +477,7 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
UpdateError("Error code 1");
m_bUse = false;
}
*/
}
}
/*
@@ -585,6 +618,36 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
}
}
//QString strHead[E_DATA_MAX] = {"article_nickname","article_id","article_title","article_date","article_data","platform_title"};
bodydata.setData(str[0].trimmed(), bodydata.ARTICLE_NICKNAME);
bodydata.setData(str[1].trimmed(), bodydata.ARTICLE_ID);
bodydata.setData(str[2].trimmed(), bodydata.ARTICLE_TITLE);
bodydata.setData(str[3].trimmed(), bodydata.ARTICLE_DATE);
bodydata.setData(str[4].trimmed(), bodydata.ARTICLE_DATA);
bodydata.setData(str[5].trimmed(), bodydata.PLATFORM_TITLE);
if(image.attribute("src").trimmed().length() != 0)
{
bodydata.setData(image.attribute("src").trimmed(), bodydata.ARTICLE_PROFILEURL);
}
strProfile = GetSafeUtf(strProfile);
if(strProfile.length() > 0)
{
bodydata.setData(strProfile, bodydata.ARTICLE_PROFILE);
}
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
bodydata.setTable(m_strTable);
bodydata.setData("naver", bodydata.PLATFORM_NAME);
bodydata.setData("blog", bodydata.PLATFORM_FORM);
bodydata.setData("body", bodydata.ARTICLE_FORM);
bodydata.setData(m_strUrl.split("/").at(3), bodydata.PLATFORM_ID);
bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID);
/*
QString strQuery = "update " + m_strTable + " set ";
for(int i = 0; i < E_DATA_MAX ; i++)
{
@@ -623,6 +686,7 @@ void SCrawler::saveFrameUrl(QWebFrame *frame)
UpdateError("Error code 5");
m_bUse = false;
}
*/
}
foreach(QWebFrame *childFrame, frame->childFrames())
@@ -655,6 +719,17 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
strId = strCommUrl.split("/").at(3).trimmed();
if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0)
strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed();
if(strCommUrl.left(1) == "/")
{
QStringList strList = strCommUrl.split("&");
foreach(QString str, strList)
{
if(str.left(3) == "id=")
{
strId = str.right(str.length() - 3);
}
}
}
strComm = GetSafeUtf(strComm);
if (strComm.isEmpty()== false)
@@ -707,7 +782,16 @@ void SCrawler::saveFrameComment(QWebFrame *frame)
if(strCommUrl.right(QString("blog.me").length()).compare("blog.me") == 0)
strId = strCommUrl.split("/").at(2).split(".").at(0).trimmed();
if(strCommUrl.left(1) == "/")
strId = strUrl.split("/").at(3).trimmed();
{
QStringList strList = strCommUrl.split("&");
foreach(QString str, strList)
{
if(str.left(3) == "id=")
{
strId = str.right(str.length() - 3);
}
}
}
if(subNick.isEmpty() == false)
{
@@ -833,10 +917,11 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame)
if (strUrl.split("/").at(2) == "cafe.naver.com")
{
QSqlQuery sql;
//QSqlQuery sql;
//if (sql.size() == 0 || sql.size() == -1)
{
/*
QString strQuery = QString("insert into ");
strQuery += m_strTable;
strQuery += QString(" set platform_name='naver',platform_form='cafe',article_form='body',article_url='%1',platform_id='%2',article_title='%3',keyword_id='%4'").arg(strUrl).arg(strUrl.split("/").at(3)).arg(strTitle).arg(m_strKeywordID);
@@ -844,6 +929,7 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame)
if (sql.exec(strUtf8) == false)
cout << "x " << sql.lastError().text().toStdString();
else
*/
{
cout << "o " << strUrl.toStdString() << endl;
}
@@ -875,23 +961,14 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
QWebElement other = Find(frame->documentElement(),"h1","class","d-none");
if (other.toPlainText().isEmpty() == false)
{
QString strQuery = "update ";
strQuery += m_strTable;
strQuery += " set ";
strQuery += "platform_title = '" + SqlString(GetSafeUtf(other.toPlainText())) + "'";
strQuery += "where article_url='";
strQuery += m_strUrl;
strQuery += "'";
QString strUtf8(strQuery.toUtf8());
QSqlQuery sql;
if (sql.exec(strUtf8) == false)
cout << "error : " << sql.lastError().text().toStdString();
bodydata.setData(SqlString(GetSafeUtf(other.toPlainText())), bodydata.PLATFORM_TITLE);
}
if (frame->frameName() == "cafe_main")
{
{
QString strData,strDate,strNick,strID,strHits;
QString strData,strDate,strNick,strID,strHits,strTitle;
{
QWebElement group = Find(frame->documentElement(),"div","class","tbody m-tcol-c");
strData = SqlString(group.toPlainText().trimmed());
@@ -909,6 +986,10 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
else
strDate += ":00";
}
{
QWebElement group = Find(frame->documentElement(),"span","class","b m-tcol-c");
strTitle = SqlString(group.toPlainText().trimmed());
}
{
QWebElement group = Find(Find(frame->documentElement(),"div","class","etc-box"),"td","class","p-nick");
@@ -941,6 +1022,21 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
strHits = Find(frame->documentElement(),"span","class","kin_count m-tcol-c _rosReadcount").toPlainText();
}
{
bodydata.setTable(m_strTable);
bodydata.setData(strData, bodydata.ARTICLE_DATA);
bodydata.setData(strDate, bodydata.ARTICLE_DATE);
bodydata.setData(strNick, bodydata.ARTICLE_NICKNAME);
bodydata.setData(strID, bodydata.ARTICLE_ID);
bodydata.setData(strHits, bodydata.ARTICLE_HIT);
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
bodydata.setData("naver", bodydata.PLATFORM_NAME);
bodydata.setData("cafe", bodydata.PLATFORM_FORM);
bodydata.setData("body", bodydata.ARTICLE_FORM);
bodydata.setData(m_strUrl.split("/").at(3), bodydata.PLATFORM_ID);
bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID);
bodydata.setData(strTitle, bodydata.ARTICLE_TITLE);
/*
QSqlQuery sql;
QString strQuery = "update ";
strQuery += m_strTable;
@@ -956,6 +1052,8 @@ void SCrawler::saveFrameCafeUrl(QWebFrame *frame)
QString strUtf8(strQuery.toUtf8());
if (sql.exec(strUtf8) == false)
cout << "error : " << sql.lastError().text().toStdString();
*/
}
}
// Comment
@@ -1075,7 +1173,7 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
QString strQuery = "delete from ";
strQuery += m_strTable;
strQuery += QString(" where article_url in %1").arg(strUrlList);
qDebug() << strQuery;
// qDebug() << strQuery;
if (sql.exec(strQuery.toUtf8()) == false)
{
@@ -1103,9 +1201,10 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
if (strUrl.split("/").at(2) == "cafe.daum.net")
{
QSqlQuery sql;
//QSqlQuery sql;
{
/*
QString strQuery = QString("insert into ");
strQuery += m_strTable;
strQuery += QString(" set platform_name='daum',platform_form='cafe',article_form='body',article_url='%1',platform_id='%2',article_title='%3',keyword_id='%4'").arg(strUrl).arg(strUrl.split("/").at(3)).arg(strTitle).arg(m_strKeywordID);
@@ -1113,6 +1212,7 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
if (sql.exec(strUtf8) == false)
cout << "x " << sql.lastError().text().toStdString();
else
*/
cout << "o " << strUrl.toStdString() << endl;
}
//else
@@ -1133,18 +1233,43 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
m_bLast = true;
}
{
QWebElement noResult = Find(frame->documentElement(),"div","id","noResult");
if(!noResult.isNull())
{
m_bLast = true;
return;
}
}
{
bool b_last = false;
b_last = Find(frame->documentElement(), "div", "class", "result_message mg_cont hide").isNull();
b_last = b_last | !(Find(frame->documentElement(), "div", "class", "result_message mg_cont").isNull());
QWebElement total = Find(eleMain,"span","class","f_nb f_l");
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
total.toPlainText().split("/").size();
QString strTotal = total.toPlainText().split("/").at(1);
strTotal = strTotal.replace(",","");
QRegExp rx("(\\d+)");
int pos = 0;
QList<QString> list;
while ((pos = rx.indexIn(strTotal, pos)) != -1)
{
list << rx.cap(1);
pos += rx.matchedLength();
}
int nTotal = list.at(0).toInt();
QStringList strList = total.toPlainText().split("/").at(0).trimmed().split("-");
int nNow = GetNumber(strList.at(strList.size() - 1));
int nNowFirst = GetNumber(strList.at(strList.size() - 2));
if (nNow >= 1000 || (nNow - nNowFirst) < 9 || b_last)
if (nNow >= 1000 || nNow >= nTotal || (nNow - nNowFirst) < 9 || b_last)
m_bLast = true;
//cout << "nNow : " << nNow << endl << "nNow - nNowFirst: " << (nNow - nNowFirst) << endl << "b_last : " << b_last << endl;
}
@@ -1161,6 +1286,11 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
if (strTitle.isEmpty() == false)
{
bodydata.setTable(m_strTable);
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
bodydata.setData(SqlString(GetSafeUtf(strTitle)), bodydata.PLATFORM_TITLE);
/*
QString strQuery = "update ";
strQuery += m_strTable;
strQuery += " set ";
@@ -1172,6 +1302,7 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
QSqlQuery sql;
if (sql.exec(strUtf8) == false)
cout << "error : " << sql.lastError().text().toStdString();
*/
}
if (frame->frameName() == "down")
@@ -1179,7 +1310,7 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
QString strHits;
{
//QString strData,strDate,strNick,strID,strHits;
QString strData,strDate,strNick,strID;
QString strData,strDate,strNick,strID,strTitle;
{
QWebElement group = Find(frame->documentElement(),"div","class","bbs_contents");
strData = SqlString(group.toPlainText().trimmed());
@@ -1199,6 +1330,11 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
else
strDate += ":00";
}
{
QWebElement group = Find(frame->documentElement(),"div","class","subject");
QWebElement group2 = Find(group,"span","class","b");
strTitle = SqlString(group2.toPlainText().trimmed());
}
{
QWebElement group = Find(Find(frame->documentElement(),"div","class","article_writer"),"a","href","#");
@@ -1231,6 +1367,22 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
}
*/
{
bodydata.setTable(m_strTable);
bodydata.setData(strData, bodydata.ARTICLE_DATA);
bodydata.setData(strDate, bodydata.ARTICLE_DATE);
bodydata.setData(strNick, bodydata.ARTICLE_NICKNAME);
if(!strID.isEmpty())
bodydata.setData(strID, bodydata.ARTICLE_ID);
bodydata.setData(strHits, bodydata.ARTICLE_HIT);
bodydata.setData(m_strUrl, bodydata.ARTICLE_URL);
bodydata.setData("daum", bodydata.PLATFORM_NAME);
bodydata.setData("cafe", bodydata.PLATFORM_FORM);
bodydata.setData("body", bodydata.ARTICLE_FORM);
bodydata.setData(m_strUrl.split("/").at(3), bodydata.PLATFORM_ID);
bodydata.setData(m_strKeywordID, bodydata.KEYWORD_ID);
bodydata.setData(strTitle, bodydata.ARTICLE_TITLE);
/*
QSqlQuery sql;
QString strQuery = "update ";
strQuery += m_strTable;
@@ -1247,6 +1399,7 @@ void SCrawler::saveFrameDaumCafeUrl(QWebFrame *frame)
QString strUtf8(strQuery.toUtf8());
if (sql.exec(strUtf8) == false)
cout << "error : " << sql.lastError().text().toStdString();
*/
}
}
// Comment
@@ -1498,7 +1651,7 @@ void SCrawler::setProxy()
cout << "p : " << strList.at(0).toStdString() << endl;
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
//QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,"101.69.199.99",80)));
//QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,"196.201.216.172",8088)));
break;
}
}

View File

@@ -2,7 +2,7 @@
#define SCRAWLER_H
#include <QtWebKitWidgets>
#include "scrawlerdata.h"
class SCrawler : public QObject
{
Q_OBJECT
@@ -36,6 +36,8 @@ private:
QString m_strReper;
QString m_strKeywordID;
SCrawlerData bodydata;
QWebPage *m_page;
QString m_strFile;
QString m_strUrl;

View File

@@ -0,0 +1,126 @@
#include "scrawlerdata.h"
#include <QSqlQuery>
#include <iostream>
#include <QVariant>
#include <QSqlDatabase>
#include <QSqlError>
using namespace std;
/**
 * Fills m_strColumn so that every E_COLUMN index maps to the name of the
 * database column it stands for. The stored values (m_strData) are left as
 * default-constructed strings.
 */
SCrawlerData::SCrawlerData()
{
    // One row per E_COLUMN value, listed in enum order.
    static const struct
    {
        E_COLUMN column;
        const char *name;
    } kColumnNames[] = {
        { PLATFORM_NAME,      "platform_name" },
        { PLATFORM_FORM,      "platform_form" },
        { PLATFORM_TITLE,     "platform_title" },
        { ARTICLE_FORM,       "article_form" },
        { ARTICLE_PARENT,     "article_parent" },
        { ARTICLE_ID,         "article_id" },
        { ARTICLE_NICKNAME,   "article_nickname" },
        { ARTICLE_TITLE,      "article_title" },
        { ARTICLE_DATA,       "article_data" },
        { ARTICLE_URL,        "article_url" },
        { ARTICLE_HIT,        "article_hit" },
        { ARTICLE_DATE,       "article_date" },
        { ARTICLE_ORDER,      "article_order" },
        { ARTICLE_PROFILE,    "article_profile" },
        { ARTICLE_PROFILEURL, "article_profileurl" },
        { PLATFORM_ID,        "platform_id" },
        { KEYWORD_ID,         "keyword_id" },
        { REPLY_URL,          "reply_url" }
    };

    const int entryCount = sizeof(kColumnNames) / sizeof(kColumnNames[0]);
    for (int entry = 0; entry < entryCount; ++entry)
        m_strColumn[kColumnNames[entry].column] = kColumnNames[entry].name;
}
/**
 * Empties every stored value (via clear()) and then every column name
 * before the object goes away.
 */
SCrawlerData::~SCrawlerData()
{
    clear();

    int column = TOTAL_COUNT;
    while (column-- > 0)
        m_strColumn[column].clear();
}
void SCrawlerData::clear()
{
for(int i = 0; i < TOTAL_COUNT; i++)
{
m_strData[i].clear();
}
}
void SCrawlerData::clear(int _num)
{
m_strData[_num].clear();
}
/**
 * Returns the value currently stored for a column.
 *
 * @param _num  column index, expected to be an E_COLUMN value below
 *              TOTAL_COUNT.
 * @return the stored string, or an empty QString when _num is out of range.
 */
QString SCrawlerData::getData(int _num)
{
    // m_strData has exactly TOTAL_COUNT slots; reading outside it would be
    // undefined behaviour.
    if (_num < 0 || _num >= TOTAL_COUNT)
        return QString();

    return m_strData[_num];
}
/**
 * Remembers the name of the table that sendDB() inserts into.
 *
 * @param _str  table name; stored as given, without any validation.
 */
void SCrawlerData::setTable(QString _str)
{
    this->m_strTable = _str;
}
/**
 * Stores the value to be written for a column on the next sendDB().
 *
 * @param _str  value to store (kept as given; sendDB() trims it when binding).
 * @param _num  column index, expected to be an E_COLUMN value below
 *              TOTAL_COUNT; any other index is ignored.
 */
void SCrawlerData::setData(QString _str, int _num)
{
    // m_strData has exactly TOTAL_COUNT slots; writing outside it would
    // corrupt memory, so out-of-range requests are dropped.
    if (_num < 0 || _num >= TOTAL_COUNT)
        return;

    m_strData[_num] = _str;
}
bool SCrawlerData::sendDB()
{
QSqlQuery query;
QString strQuery;
strQuery = "insert into " + m_strTable + "(";
for(int i = 0; i < TOTAL_COUNT; i++)
{
strQuery += (m_strColumn[i] + ",");
}
strQuery = strQuery.left(strQuery.size() - 1);
strQuery += ") VALUES (";
for(int i = 0; i < TOTAL_COUNT; i++)
{
strQuery += (":" + m_strColumn[i] + ",");
}
strQuery = strQuery.left(strQuery.size() - 1);
strQuery += ")";
query.prepare(strQuery.toUtf8());
for(int i = 0; i < TOTAL_COUNT; i++)
{
if(i == ARTICLE_ORDER)
query.bindValue(QString(":" + m_strColumn[i]), m_strData[i].trimmed().toInt());
query.bindValue(QString(":" + m_strColumn[i]), m_strData[i].trimmed().toUtf8());
}
if (query.exec()==false)
{
cout << "error : " << query.lastError().text().toStdString();
return false;
}
return true;
}
/**
 * Returns a copy of the input containing only the characters this class
 * treats as safe; everything else (punctuation, for example) is dropped.
 *
 * A character is kept when it is
 *   - in the code-point range 12593..12622 (Hangul compatibility consonants),
 *   - in the code-point range 44032..55203 (Hangul syllables), or
 *   - a digit, number, whitespace, lowercase letter, uppercase letter or
 *     symbol according to QChar.
 * The three tests are applied independently, one append per matching test.
 *
 * @param _strData  text to filter.
 * @return the filtered text, in the original order.
 */
QString SCrawlerData::GetSafeUtf(QString _strData)
{
    QString filtered;
    const int length = _strData.length();
    for (int pos = 0; pos < length; ++pos)
    {
        const QChar ch = _strData.at(pos);
        const ushort code = ch.unicode();

        const bool inJamoRange = (code >= 12593 && code <= 12622);
        const bool inSyllableRange = (code >= 44032 && code <= 55203);
        const bool inGeneralClass = ch.isDigit() || ch.isNumber() || ch.isSpace()
                                    || ch.isLower() || ch.isUpper() || ch.isSymbol();

        if (inJamoRange)
            filtered += ch;
        if (inSyllableRange)
            filtered += ch;
        if (inGeneralClass)
            filtered += ch;
    }
    return filtered;
}

View File

@@ -0,0 +1,54 @@
#ifndef SCRAWLERDATA
#define SCRAWLERDATA

#include <QString>
#include <QStringList>

/**
 * Holds the values of one database row, one string per column, and writes
 * them with a single INSERT (see sendDB()).
 *
 * Values are addressed by E_COLUMN index; the matching column names are set
 * up by the constructor.
 */
class SCrawlerData
{
public:
    /// Index of each column in the value and column-name arrays.
    /// TOTAL_COUNT is the number of columns, not a column itself.
    enum E_COLUMN
    {
        PLATFORM_NAME = 0,
        PLATFORM_FORM,
        PLATFORM_TITLE,
        ARTICLE_FORM,
        ARTICLE_PARENT,
        ARTICLE_ID,
        ARTICLE_NICKNAME,
        ARTICLE_TITLE,
        ARTICLE_DATA,
        ARTICLE_URL,
        ARTICLE_HIT,
        ARTICLE_DATE,
        ARTICLE_ORDER,
        ARTICLE_PROFILE,
        ARTICLE_PROFILEURL,
        PLATFORM_ID,
        KEYWORD_ID,
        REPLY_URL,
        TOTAL_COUNT
    };

private:
    QString m_strData[TOTAL_COUNT];    ///< value to insert, indexed by E_COLUMN
    QString m_strColumn[TOTAL_COUNT];  ///< column name, indexed by E_COLUMN
    QString m_strTable;                ///< target table of sendDB()

private:
    /// Returns _strData with only Hangul, alphanumeric, whitespace and
    /// symbol characters kept.
    QString GetSafeUtf(QString _strData);
    /// NOTE(review): declared here, but no definition appears in
    /// scrawlerdata.cpp as shown - confirm before calling.
    QString getTable();

public:
    SCrawlerData();
    ~SCrawlerData();

    /// Returns the value stored for column _num (an E_COLUMN index).
    QString getData(int _num);
    /// Stores _str as the value for column _num (an E_COLUMN index).
    void setData(QString _str, int _num);
    /// Empties the stored value of every column.
    void clear();
    /// Empties the stored value of column _num (an E_COLUMN index).
    void clear(int _num);
    /// Inserts the stored values as one row; returns false on failure.
    bool sendDB();
    /// Sets the table that sendDB() inserts into.
    void setTable(QString _str);
};

// The guard has to enclose the whole header; closing it right after the
// #define would let a second inclusion redefine the class.
#endif // SCRAWLERDATA