#include #include #include #include #include #include #include #include using namespace std; #include "sreplygetmanage.h" #include "srunnable.h" #include "data.h" extern void Debug(QString _strFilename,QString _strData); SReplyGetManage::SReplyGetManage() : m_nTotal(-1) { m_pool = new QThreadPool; } SReplyGetManage::~SReplyGetManage() { } void SReplyGetManage::SetUrl(QString _strUrl) { m_strUrl = _strUrl; QStringList strList = _strUrl.split("&"); QString strOid,strAid; { foreach(QString str ,strList) { QStringList strListData = str.split("="); if (strListData.size() == 2 ) { if (strListData.at(0) == "oid") strOid = strListData.at(1); if (strListData.at(0) == "aid") strAid = strListData.at(1); } } } m_strGno = "news" + strOid + "%2C" + strAid; //qDebug() << "m_strGno : " << m_strGno; } void SReplyGetManage::NaverNewsRun() { if (m_nTotal <= 0) return; QHostInfo info = QHostInfo::fromName("comment.news.naver.com"); QTcpSocket socket; socket.connectToHost(info.addresses().at(rand()%info.addresses().size()).toString(),80); if(!socket.waitForConnected()) { cout << "Error: " << socket.errorString().toStdString(); } QString strTotal = QString::number(m_nTotal); QString strParam = "pageSize="+strTotal+"&gno=" + m_strGno + "&serviceId=news&page=1"; socket.write(QString("POST /api/comment/list.json HTTP/1.1\r\n" "Host: comment.news.naver.com\r\n" "Connection: keep-alive\r\n" "Content-Length: "+QString::number(strParam.size())+"\r\n" "charset: utf-8\r\n" "Origin: http://comment.news.naver.com\r\n" "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36\r\n" "Content-Type: application/x-www-form-urlencoded; charset=UTF-8\r\n" "Accept: */*\r\n" "Accept-Encoding: deflate\r\n" "Accept-Language: ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2\r\n\r\n"+strParam).toUtf8()); QByteArray byArray; while (socket.waitForReadyRead()) { byArray += socket.readAll(); } /* { QFile raw("c:\\data\\rawnews.json"); raw.open(QFile::WriteOnly | QFile::Text); QTextStream in(&raw); in << QString(byArray); raw.close(); } */ int index = byArray.indexOf("{"); byArray=byArray.mid(index-2); bool bFlag = true; QString strOut; while(bFlag) { strOut += byArray.left(8188); byArray=byArray.mid(8192); if (byArray.size() <= 8192) { bFlag = false; strOut += byArray; } } strOut = strOut.replace("\r\n","").replace("\n",""); { QRegExp re("\\}\\}[\\w\\s]*$"); strOut = strOut.replace(re,"}}"); } QJsonParseError error; QJsonDocument d = QJsonDocument::fromJson(strOut.toUtf8(),&error); if (error.error != 0) { cout << error.errorString().toStdString(); } m_pool->setMaxThreadCount(4); SReplyData *pReply = new SReplyData[m_nTotal]; int nCount = 0; bFlag = false; foreach(QJsonValue value ,d.object().value("message").toObject().value("result").toObject().value("commentReplies").toArray()) { QJsonObject obj = value.toObject(); pReply[nCount].m_nReplyReplyCount = obj["replyCount"].toInt(); //qDebug() << "m_nReplyReply" << pReply[nCount].m_nReplyReplyCount; int i= E_REPLY_USER_ID; while (i < E_REPLY_MAX) { if (i <= E_REPLY_CONTENT) pReply[nCount].m_strReplyData[i] = obj[g_strJsonReplyHead[i]].toString(); else { pReply[nCount].m_strReplyData[i] = QString::number(obj[g_strJsonReplyHead[i]].toInt()); } i++; } pReply[nCount].m_strReplyData[E_REPLY_CONTENT] += "\r\n"; for (int i = E_REPLY_COUNT_GOOD; i < E_REPLY_MAX ; i++) { pReply[nCount].m_strReplyData[E_REPLY_CONTENT] += "("; pReply[nCount].m_strReplyData[E_REPLY_CONTENT] += g_strJsonReplyHead[i]; pReply[nCount].m_strReplyData[E_REPLY_CONTENT] += ","; pReply[nCount].m_strReplyData[E_REPLY_CONTENT] += pReply[nCount].m_strReplyData[i]; pReply[nCount].m_strReplyData[E_REPLY_CONTENT] += ")\r\n"; } bFlag = pReply[nCount].m_strReplyData[E_REPLY_USER_ID].isEmpty(); if (pReply[nCount].m_nReplyReplyCount > 0 ) { SRunnable *pRun = new SRunnable(m_nSelect); QString strParam = "commentNo="; strParam += QString::number(obj["commentReplyNo"].toInt()); strParam += "&pageSize=100&gno="; strParam += m_strGno; strParam += "&serviceId=news"; pRun->SetParam(strParam,&pReply[nCount].m_strReplyReply); pRun->m_strID = QString::number(obj["commentReplyNo"].toInt()); pRun->setAutoDelete(true); m_pool->start(pRun); } nCount++; } m_pool->waitForDone(); // qDebug() << "m_nTotal = " << m_nTotal; for (int i = 0; i < m_nTotal ; i++) { m_pData->setData(m_pData->GetDate(pReply[i].m_strReplyData[E_REPLY_DATE]), SCrawlerData::ARTICLE_DATE); m_pData->setData(m_pData->SqlString(m_pData->GetSafeUtf(pReply[i].m_strReplyData[E_REPLY_CONTENT])), SCrawlerData::ARTICLE_DATA); m_pData->setData(m_pData->SqlString(m_pData->GetSafeUtf(pReply[i].m_strReplyData[E_REPLY_USER_NICKNAME])), SCrawlerData::ARTICLE_NICKNAME); m_pData->setData(m_pData->SqlString(m_pData->GetSafeUtf(pReply[i].m_strReplyData[E_REPLY_USER_ID])), SCrawlerData::ARTICLE_ID); m_pData->setData(m_pData->SqlString(m_pData->GetSafeUtf(pReply[i].m_strReplyReply.join("\r\n"))), SCrawlerData::ETC); m_pData->setData(QString::number(i), SCrawlerData::ARTICLE_ORDER); m_pData->sendDB(); } delete [] pReply; } void SReplyGetManage::DaumCafeRun() { for(int i = 0; i < E_DAUM_MAX; i++) if(m_astrDaum[i].isEmpty()) return; if (m_astrDaum[E_DAUM_DOWNSRC].indexOf("?") == -1) return; QMap mapData; foreach(QString str,m_astrDaum[E_DAUM_DOWNSRC].split("?").at(1).split("&")) { QStringList list = str.split("="); if (list.size() != 2) return; mapData.insert(list.at(0),list.at(1)); } int nTotal = m_astrDaum[E_DAUM_TOTAL].toInt() - 1; if (nTotal < 0) return; m_pool->setMaxThreadCount(8); QStringList *pList = new QStringList[nTotal]; for (int i = 0; i < nTotal ; i++) { QString strParam = "callCount=1\n" "page="+m_astrDaum[E_DAUM_DOWNSRC]+"&search=true\n" "httpSessionId=\n" "scriptSessionId=\n" "c0-scriptName=ShortComment\n" "c0-methodName=getList\n" "c0-id=0\n" "c0-param0=string:"+mapData["fldid"]+"\n" "c0-param1=string:"+mapData["datanum"]+"\n" "c0-param2=string:"+m_astrDaum[E_DAUM_TOTAL]+"\n" "c0-param3=number:"+QString::number(i+1)+ "\n" "c0-param4=string:"+m_astrDaum[E_DAUM_CDEPTH]+"\n" "c0-param5=null:null\n" "c0-param6=boolean:false\n" "c0-param7=boolean:false\n" "c0-param8=boolean:false\n" "c0-param9=boolean:false\n" "batchId=1\n"; QString strSend = QString("POST /_c21_/dwr/shortcomment/call/plaincall/ShortComment.getList.dwr HTTP/1.1\r\n" "Host: cafe.daum.net\r\n" "Connection: keep-alive\r\n" "Content-Length: "+QString::number(strParam.size())+"\r\n" "Origin: http://cafe.daum.net\r\n" "User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36\r\n" "Content-Type: text/plain\r\n" "Accept: */*\r\n" "Referer: http://cafe.daum.net/"+m_astrDaum[E_DAUM_DOWNSRC]+"&search=true\r\n" "Accept-Encoding: gzip, deflate\r\n" "Accept-Language: ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2\r\n\r\n") + strParam; SRunnable *pRun = new SRunnable(m_nSelect); pRun->SetParam(strSend,&pList[i]); pRun->m_strID = QString::number(i); pRun->setAutoDelete(true); m_pool->start(pRun); } m_pool->waitForDone(); for (int i = 0; i < nTotal ; i++) { int j = 0; while(j < pList[i].size()) { m_pData->setData(m_pData->SqlString(m_pData->GetSafeUtf(pList[i].at(j++))),SCrawlerData::ARTICLE_DATE); m_pData->setData(m_pData->SqlString(m_pData->GetSafeUtf(pList[i].at(j++))),SCrawlerData::ARTICLE_DATA); m_pData->setData(m_pData->SqlString(m_pData->GetSafeUtf(pList[i].at(j++))),SCrawlerData::ARTICLE_NICKNAME); m_pData->setData(m_pData->SqlString(m_pData->GetSafeUtf(pList[i].at(j++))),SCrawlerData::ARTICLE_PARENT); m_pData->setData(m_pData->SqlString(m_pData->GetSafeUtf(pList[i].at(j++))),SCrawlerData::ARTICLE_ID); m_pData->setData(m_pData->SqlString(m_pData->GetSafeUtf(pList[i].at(j++))),SCrawlerData::ARTICLE_ORDER); m_pData->sendDB(); } } delete [] pList; } void SReplyGetManage::Start(SCrawlerData *_pData) { _pData->setData("reply", SCrawlerData::ARTICLE_FORM); m_pData = _pData; switch(m_nSelect) { case E_NAVER_NEWS: NaverNewsRun(); break; case E_DAUM_CAFE: DaumCafeRun(); break; } }