Files
clients/AjaxCrawlerProcess/srunnable.cpp
admin d48ec1fbcb navernews 댓글 가져오기 수정
json형식 뒤에 이상한 문자들이 붙어있어서 오류를 일으킴

git-svn-id: svn://192.168.0.12/source@218 8346c931-da38-4b9b-9d4c-e48b93cbd075
2015-10-29 02:07:35 +00:00

277 lines
9.1 KiB
C++

#include <QJsonDocument>
#include <QJsonObject>
#include <QJsonValue>
#include <QJsonArray>
#include <QHostInfo>
#include <QWebPage>
#include <QWebFrame>
#include <QWebElement>
#include <QChar>
#include <QByteArray>
#include <QRegExp>
#include "srunnable.h"
#include "data.h"
extern void Debug(QString _strFilename,QString _strData);
// Creates a runnable for one crawl target.
//
// _nSelect is stored in m_nSelect; run() compares it with E_NAVER_NEWS and
// E_DAUM_CAFE to decide which crawl routine to execute.
SRunnable::SRunnable(int _nSelect)
{
    m_nSelect = _nSelect;
    // No output container is attached yet; the Run* routines push their
    // results through this pointer.
    m_pstrOut = 0;
}
// Intentionally empty: no cleanup is performed in the destructor body.
SRunnable::~SRunnable() {}
void SRunnable::RunNaverNews()
{
QHostInfo info = QHostInfo::fromName("comment.news.naver.com");
QTcpSocket socket;
socket.connectToHost(info.addresses().at(rand()%info.addresses().size()).toString(),80);
if(!socket.waitForConnected())
{
qDebug() << "Error: " << socket.errorString();
}
socket.write(QString("POST /api/reply/list.json HTTP/1.1\r\n"
"Host: comment.news.naver.com\r\n"
"User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0\r\n"
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n"
"Accept-Language: ko-KR,ko;q=0.8,en-US;q=0.5,en;q=0.3\r\n"
"Accept-Encoding: deflate\r\n"
"Content-Type: application/x-www-form-urlencoded; charset=utf-8\r\n"
"charset: utf-8\r\n"
"Content-Length: " + QString::number(m_strParam.length()) + "\r\n"
"Connection: keep-alive\r\n"
"Pragma: no-cache\r\n"
"Cache-Control: no-cache\r\n\r\n" + m_strParam).toUtf8());
QByteArray byArray;
while (socket.waitForReadyRead())
{
byArray += socket.readAll();
}
QJsonDocument d;
{
QJsonParseError error;
int index = byArray.indexOf("{");
byArray=byArray.mid(index-2);
bool bFlag = true;
QString strOut;
while(bFlag)
{
strOut += byArray.left(8188);
byArray=byArray.mid(8192);
if (byArray.size() <= 8192)
{
bFlag = false;
strOut += byArray;
}
}
strOut = strOut.replace("\r\n","").replace("\n","");
{
QRegExp re("\\}\\}[\\w\\s]*$");
strOut = strOut.replace(re,"}}");
}
if (strOut.length() <= 0 ) return;
d = QJsonDocument::fromJson(strOut.trimmed().toUtf8(),&error);
if (error.error != 0)
{
qDebug() << error.errorString();
Debug("reply.json",strOut);
socket.close();
exit(0);
}
}
QString astrOut[E_REPLY_MAX];
foreach(QJsonValue value ,d.object().value("message").toObject().value("result").toObject().value("commentReplies").toArray())
{
QJsonObject obj = value.toObject();
int i= E_REPLY_USER_ID;
while (i < E_REPLY_MAX)
{
if (i <= E_REPLY_CONTENT)
astrOut[i] = obj[g_strJsonReplyHead[i]].toString();
else
astrOut[i] = QString::number(obj[g_strJsonReplyHead[i]].toInt());
i++;
}
for (i = 0; i < E_REPLY_MAX;i++)
{
QString strOut = g_strJsonReplyHead[i] + " : " + astrOut[i];
m_pstrOut->push_back(strOut);
}
m_pstrOut->push_back("");
}
socket.close();
}
// Converts one "\uXXXX" escape into the single UTF-16 code unit it denotes.
//
// _str: the escape text; every "\u" occurrence is removed and the first four
//       remaining bytes are read as hexadecimal digits (upper or lower case).
//       A byte that is not a hex digit contributes 0 for its nibble, and if
//       fewer than four bytes remain only those are used (left-aligned).
// Returns a one-character QString holding that code unit.
QString CharToUnicode(QString _str)
{
    const QByteArray byHex = _str.replace("\\u","").toUtf8();
    unsigned short ushCode = 0;
    // Bounded by the actual size so short input cannot be read out of range.
    for (int i = 0; i < 4 && i < byHex.size(); i++)
    {
        const char ch = byHex.at(i);
        int nDigit = 0;
        if (ch >= '0' && ch <= '9')
            nDigit = ch - '0';
        else if (ch >= 'A' && ch <= 'F')
            nDigit = ch - 'A' + 10;
        else if (ch >= 'a' && ch <= 'f')
            nDigit = ch - 'a' + 10; // lowercase digits are matched by StringToUniCode too
        // The first digit is the most significant nibble.
        ushCode = static_cast<unsigned short>(ushCode | (nDigit << ((3 - i) * 4)));
    }
    return QString(QChar(ushCode));
}
// Replaces every "\uXXXX" escape (four hex digits, either case) in _str with
// the character produced by CharToUnicode() and returns the result.
QString StringToUniCode(QString _str)
{
    QRegExp rx("(\\\\u[0-9a-fA-F]{4})");
    int pos = 0;
    while ((pos = rx.indexIn(_str, pos)) != -1)
    {
        const QString strDecoded = CharToUnicode(rx.cap(1));
        _str.replace(pos, rx.matchedLength(), strDecoded);
        // Resume after the inserted text: a decoded backslash must not pair
        // with the characters that follow it and be unescaped a second time.
        pos += strDecoded.length();
    }
    return _str;
}
void SRunnable::RunDaumCafe()
{
QHostInfo info = QHostInfo::fromName("cafe.daum.net");
QTcpSocket socket;
socket.connectToHost(info.addresses().at(rand()%info.addresses().size()).toString(),80);
if(!socket.waitForConnected())
{
qDebug() << "Error: " << socket.errorString();
}
socket.write(m_strParam.toUtf8());
QByteArray byArray;
while (socket.waitForReadyRead())
byArray += socket.readAll();
QString strOut;
foreach (QByteArray byData ,byArray.split('\n'))
{
byData = byData.replace('\r',"");
if (byData.size() == 5 || byData.size() == 4 )
{
int i = 0;
foreach(QChar ch,byData.left(4))
{
if (ch.isDigit()) i++;
if (ch >= 'a' && ch <= 'f') i++;
}
if (i == 4) continue;
}
strOut += byData;
}
strOut=strOut.replace("\\t","\t").replace("\\r\\n","\r\n").replace("\\\"","\"").replace("\\'","'");
int nMode = -1;
bool bDate = false;
bool bContent = false;
int nCount = -1;
bool bReply = false;
QString strParent;
QString strPreviousID;
struct SData
{
QString m_strContent;
QString m_strNick;
QString m_strDate;
QString m_strParent;
QString m_strID;
};
SData astData[50];
//read line by line from ajax script
foreach(QString str,strOut.split("\n"))
{
str = str.trimmed();
if (str.isEmpty()) continue;
if (str.indexOf(QString("class=\"comment_pos\"")) > -1 ) {bReply = false;nMode = 0;nCount++;}
else if (str.indexOf(QString("class=\"recomment_pos recomment_first\"")) > -1 ) {bReply = true;nMode = 0;nCount++;}
else if (str.indexOf(QString("class=\"recomment_pos\"")) > -1) {bReply = true;nMode = 0;nCount++;}
if (bDate && nCount >= 0)
astData[nCount].m_strDate = str;
if (bContent && nCount >= 0)
astData[nCount].m_strContent = str;
bDate = (str.indexOf(QString("class=\"comment_date txt_sub p11 ls0\"")) > -1); // date is in next line
bContent = (str.indexOf(QString("class=\"comment_contents\"")) > -1); // content is in next line
// check id, nickname
if (str.indexOf(QString("onclick=\"showSideView(this")) > -1)
{
{
int nFirst = str.indexOf("'")+1;
int nLast = str.indexOf("'",nFirst);
if (nFirst > -1 && nLast > -1)
{
strPreviousID = astData[nCount].m_strID;
astData[nCount].m_strID = str.mid(nFirst,nLast-nFirst);
}
}
{
int nFirst = str.indexOf(QString("class=\"b\">")) + QString("class=\"b\">").length();
int nLast = str.indexOf(QString("</a>"));
if (nFirst > -1 && nLast > -1)
astData[nCount].m_strNick = str.mid(nFirst,nLast-nFirst);
if (bReply == false)
strParent = astData[nCount].m_strNick;
else
astData[nCount].m_strParent = strParent; // if sub reply, then put parent
}
}
// check sub sub reply
if (str.indexOf(QString("class=\"mention_nicknames text_counter txt_point b\">")) > -1 )
{
int nFirst = str.indexOf(QString("class=\"mention_nicknames text_counter txt_point b\">")) + QString("class=\"mention_nicknames text_counter txt_point b\">").length();
int nLast = str.indexOf(QString("</span>"));
nMode = 1;
astData[nCount].m_strParent = str.mid(nFirst,nLast-nFirst);
astData[nCount].m_strID = strPreviousID;
}
// nMode : 0-normal reply or reply reply. 1,2-reply reply reply. 2-contents
switch(nMode)
{
case 1:
if (str == "</a>") nMode = 2; break;
case 2:
astData[nCount].m_strContent = str;
nMode = 0;
break;
}
}
for(int i = 0; i < 50; i++)
{
if (astData[i].m_strDate.isEmpty()) continue;
if (astData[i].m_strNick.isEmpty()) continue;
m_pstrOut->push_back(astData[i].m_strDate.trimmed().replace(".","-").replace("- "," "));
m_pstrOut->push_back(StringToUniCode(astData[i].m_strContent));
m_pstrOut->push_back(StringToUniCode(astData[i].m_strNick));
m_pstrOut->push_back(StringToUniCode(astData[i].m_strParent));
m_pstrOut->push_back(StringToUniCode(astData[i].m_strID));
m_pstrOut->push_back(QString::number(m_strID.toInt()*50 + i));
}
socket.close();
}
// Entry point of the runnable: dispatches to the crawl routine selected by
// m_nSelect. Any other selector value does nothing.
void SRunnable::run()
{
    if (m_nSelect == E_NAVER_NEWS)
    {
        RunNaverNews();
    }
    else if (m_nSelect == E_DAUM_CAFE)
    {
        RunDaumCafe();
    }
}