Merge branch 'working/twitter'
This commit is contained in:
@@ -28,7 +28,11 @@ SOURCES += main.cpp\
|
||||
skakaousermanage.cpp \
|
||||
sfacebooktagmanage.cpp \
|
||||
sfacebookusermanage.cpp \
|
||||
snaverblogaccuracymanager.cpp
|
||||
snaverblogaccuracymanager.cpp \
|
||||
stwittertagmanage.cpp \
|
||||
stwitterusermanage.cpp \
|
||||
syoutubetagmanage.cpp \
|
||||
syoutubeusermanage.cpp
|
||||
|
||||
HEADERS += widget.h \
|
||||
smanage.h \
|
||||
@@ -45,5 +49,9 @@ HEADERS += widget.h \
|
||||
skakaousermanage.h \
|
||||
sfacebooktagmanage.h \
|
||||
sfacebookusermanage.h \
|
||||
snaverblogaccuracymanage.h
|
||||
snaverblogaccuracymanage.h \
|
||||
stwittertagmanage.h \
|
||||
stwitterusermanage.h \
|
||||
syoutubetagmanage.h \
|
||||
syoutubeusermanage.h
|
||||
|
||||
|
||||
89
CrawlerList/stwittertagmanage.cpp
Normal file
89
CrawlerList/stwittertagmanage.cpp
Normal file
@@ -0,0 +1,89 @@
|
||||
#include "stwittertagmanage.h"
|
||||
#include <QThread>
|
||||
#include "widget.h"
|
||||
STwitterTagManage::STwitterTagManage(QObject *pObject) : SManage(pObject)
|
||||
{
|
||||
m_nID = 0;
|
||||
connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput()));
|
||||
connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError()));
|
||||
}
|
||||
|
||||
|
||||
void STwitterTagManage::Start()
|
||||
{
|
||||
m_nMode = E_PROCESS_RUN;
|
||||
m_bFinalLast = false;
|
||||
}
|
||||
|
||||
bool STwitterTagManage::Update()
|
||||
{
|
||||
if(m_bFinalLast) return m_bFinalLast;
|
||||
switch(m_nMode)
|
||||
{
|
||||
case E_PROCESS_RUN:
|
||||
if(UseProcess() == false)
|
||||
{
|
||||
#if defined(Q_OS_WIN32)
|
||||
m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
|
||||
#else
|
||||
m_pro[0].start("/usr/bin/python3", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
|
||||
#endif
|
||||
m_nMode = E_PROCESS_FINISH_WAIT;
|
||||
}
|
||||
break;
|
||||
|
||||
case E_PROCESS_FINISH_WAIT:
|
||||
break;
|
||||
}
|
||||
|
||||
return m_bFinalLast;
|
||||
}
|
||||
|
||||
void STwitterTagManage::processFinished(QProcess *pPro, QString _strOut)
|
||||
{
|
||||
switch(m_nMode)
|
||||
{
|
||||
case E_PROCESS_FINISH_WAIT:
|
||||
m_nMode = E_PROCESS_RUN;
|
||||
m_bFinalLast = true;
|
||||
m_pMain->InsertLog("Finish Crawling :)");
|
||||
m_pMain->SetCrawlingState("Finish");
|
||||
m_ncList=1;
|
||||
m_bLast = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void STwitterTagManage::readStandardOutput()
|
||||
{
|
||||
QProcess *pPro = (QProcess*)sender();
|
||||
QThread::msleep(100);
|
||||
QString str = pPro->readAllStandardOutput();
|
||||
QStringList list = str.split("\n", QString::SkipEmptyParts);
|
||||
foreach(QString log,list)
|
||||
{
|
||||
if (m_pMain)
|
||||
{
|
||||
m_pMain->InsertLog(log);
|
||||
}
|
||||
else
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
void STwitterTagManage::readStandardError()
|
||||
{
|
||||
QProcess *pPro = (QProcess*)sender();
|
||||
QThread::msleep(100);
|
||||
QString str = pPro->readAllStandardError();
|
||||
QStringList list = str.split("\n", QString::SkipEmptyParts);
|
||||
foreach(QString log,list)
|
||||
{
|
||||
if (m_pMain)
|
||||
{
|
||||
m_pMain->InsertLog(log);
|
||||
}
|
||||
else
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
30
CrawlerList/stwittertagmanage.h
Normal file
30
CrawlerList/stwittertagmanage.h
Normal file
@@ -0,0 +1,30 @@
|
||||
#ifndef STWITTERTAGMANAGE_H
|
||||
#define STWITTERTAGMANAGE_H
|
||||
#include "smanage.h"
|
||||
|
||||
class STwitterTagManage : public SManage
|
||||
{
|
||||
Q_OBJECT
|
||||
public:
|
||||
enum E_PROCESS_STATE
|
||||
{
|
||||
E_PROCESS_RUN = 0,
|
||||
E_PROCESS_FINISH_WAIT,
|
||||
};
|
||||
STwitterTagManage(QObject *pObject);
|
||||
private:
|
||||
QString makeGetListQuery(QString _str,QDate _date,int _nPage);
|
||||
private:
|
||||
QString m_strListQuery;
|
||||
QVector <QString> m_strListURL;
|
||||
protected:
|
||||
bool Update();
|
||||
void Start();
|
||||
void processFinished(QProcess *pPro,QString _strOut);
|
||||
void ReLoadList();
|
||||
private slots:
|
||||
void readStandardOutput();
|
||||
void readStandardError();
|
||||
};
|
||||
#endif // STWITTERTAGMANAGE_H
|
||||
|
||||
89
CrawlerList/stwitterusermanage.cpp
Normal file
89
CrawlerList/stwitterusermanage.cpp
Normal file
@@ -0,0 +1,89 @@
|
||||
#include "stwitterusermanage.h"
|
||||
#include <QThread>
|
||||
#include "widget.h"
|
||||
STwitterUserManage::STwitterUserManage(QObject *pObject) : SManage(pObject)
|
||||
{
|
||||
m_nID = 0;
|
||||
connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput()));
|
||||
connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError()));
|
||||
}
|
||||
|
||||
|
||||
void STwitterUserManage::Start()
|
||||
{
|
||||
m_nMode = E_PROCESS_RUN;
|
||||
m_bFinalLast = false;
|
||||
}
|
||||
|
||||
bool STwitterUserManage::Update()
|
||||
{
|
||||
if(m_bFinalLast) return m_bFinalLast;
|
||||
switch(m_nMode)
|
||||
{
|
||||
case E_PROCESS_RUN:
|
||||
if(UseProcess() == false)
|
||||
{
|
||||
#if defined(Q_OS_WIN32)
|
||||
m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
|
||||
#else
|
||||
m_pro[0].start("/usr/bin/python3", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
|
||||
#endif
|
||||
m_nMode = E_PROCESS_FINISH_WAIT;
|
||||
}
|
||||
break;
|
||||
|
||||
case E_PROCESS_FINISH_WAIT:
|
||||
break;
|
||||
}
|
||||
|
||||
return m_bFinalLast;
|
||||
}
|
||||
|
||||
void STwitterUserManage::processFinished(QProcess *pPro, QString _strOut)
|
||||
{
|
||||
switch(m_nMode)
|
||||
{
|
||||
case E_PROCESS_FINISH_WAIT:
|
||||
m_nMode = E_PROCESS_RUN;
|
||||
m_bFinalLast = true;
|
||||
m_pMain->InsertLog("Finish Crawling :)");
|
||||
m_pMain->SetCrawlingState("Finish");
|
||||
m_ncList=1;
|
||||
m_bLast = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void STwitterUserManage::readStandardOutput()
|
||||
{
|
||||
QProcess *pPro = (QProcess*)sender();
|
||||
QThread::msleep(100);
|
||||
QString str = pPro->readAllStandardOutput();
|
||||
QStringList list = str.split("\n", QString::SkipEmptyParts);
|
||||
foreach(QString log,list)
|
||||
{
|
||||
if (m_pMain)
|
||||
{
|
||||
m_pMain->InsertLog(log);
|
||||
}
|
||||
else
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
void STwitterUserManage::readStandardError()
|
||||
{
|
||||
QProcess *pPro = (QProcess*)sender();
|
||||
QThread::msleep(100);
|
||||
QString str = pPro->readAllStandardError();
|
||||
QStringList list = str.split("\n", QString::SkipEmptyParts);
|
||||
foreach(QString log,list)
|
||||
{
|
||||
if (m_pMain)
|
||||
{
|
||||
m_pMain->InsertLog(log);
|
||||
}
|
||||
else
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
30
CrawlerList/stwitterusermanage.h
Normal file
30
CrawlerList/stwitterusermanage.h
Normal file
@@ -0,0 +1,30 @@
|
||||
#ifndef STWITTERUSERMANAGE_H
|
||||
#define STWITTERUSERMANAGE_H
|
||||
#include "smanage.h"
|
||||
|
||||
class STwitterUserManage : public SManage
|
||||
{
|
||||
Q_OBJECT
|
||||
public:
|
||||
enum E_PROCESS_STATE
|
||||
{
|
||||
E_PROCESS_RUN = 0,
|
||||
E_PROCESS_FINISH_WAIT,
|
||||
};
|
||||
STwitterUserManage(QObject *pObject);
|
||||
private:
|
||||
QString makeGetListQuery(QString _str,QDate _date,int _nPage);
|
||||
private:
|
||||
QString m_strListQuery;
|
||||
QVector <QString> m_strListURL;
|
||||
protected:
|
||||
bool Update();
|
||||
void Start();
|
||||
void processFinished(QProcess *pPro,QString _strOut);
|
||||
void ReLoadList();
|
||||
private slots:
|
||||
void readStandardOutput();
|
||||
void readStandardError();
|
||||
};
|
||||
#endif // STWITTERUSERMANAGE_H
|
||||
|
||||
89
CrawlerList/syoutubetagmanage.cpp
Normal file
89
CrawlerList/syoutubetagmanage.cpp
Normal file
@@ -0,0 +1,89 @@
|
||||
#include "syoutubetagmanage.h"
|
||||
#include <QThread>
|
||||
#include "widget.h"
|
||||
SYoutubeTagManage::SYoutubeTagManage(QObject *pObject) : SManage(pObject)
|
||||
{
|
||||
m_nID = 0;
|
||||
connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput()));
|
||||
connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError()));
|
||||
}
|
||||
|
||||
|
||||
void SYoutubeTagManage::Start()
|
||||
{
|
||||
m_nMode = E_PROCESS_RUN;
|
||||
m_bFinalLast = false;
|
||||
}
|
||||
|
||||
bool SYoutubeTagManage::Update()
|
||||
{
|
||||
if(m_bFinalLast) return m_bFinalLast;
|
||||
switch(m_nMode)
|
||||
{
|
||||
case E_PROCESS_RUN:
|
||||
if(UseProcess() == false)
|
||||
{
|
||||
#if defined(Q_OS_WIN32)
|
||||
m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
|
||||
#else
|
||||
m_pro[0].start("/usr/bin/python3", QStringList() << "webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
|
||||
#endif
|
||||
m_nMode = E_PROCESS_FINISH_WAIT;
|
||||
}
|
||||
break;
|
||||
|
||||
case E_PROCESS_FINISH_WAIT:
|
||||
break;
|
||||
}
|
||||
|
||||
return m_bFinalLast;
|
||||
}
|
||||
|
||||
void SYoutubeTagManage::processFinished(QProcess *pPro, QString _strOut)
|
||||
{
|
||||
switch(m_nMode)
|
||||
{
|
||||
case E_PROCESS_FINISH_WAIT:
|
||||
m_nMode = E_PROCESS_RUN;
|
||||
m_bFinalLast = true;
|
||||
m_pMain->InsertLog("Finish Crawling :)");
|
||||
m_pMain->SetCrawlingState("Finish");
|
||||
m_ncList=1;
|
||||
m_bLast = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void SYoutubeTagManage::readStandardOutput()
|
||||
{
|
||||
QProcess *pPro = (QProcess*)sender();
|
||||
QThread::msleep(100);
|
||||
QString str = pPro->readAllStandardOutput();
|
||||
QStringList list = str.split("\n", QString::SkipEmptyParts);
|
||||
foreach(QString log,list)
|
||||
{
|
||||
if (m_pMain)
|
||||
{
|
||||
m_pMain->InsertLog(log);
|
||||
}
|
||||
else
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
void SYoutubeTagManage::readStandardError()
|
||||
{
|
||||
QProcess *pPro = (QProcess*)sender();
|
||||
QThread::msleep(100);
|
||||
QString str = pPro->readAllStandardError();
|
||||
QStringList list = str.split("\n", QString::SkipEmptyParts);
|
||||
foreach(QString log,list)
|
||||
{
|
||||
if (m_pMain)
|
||||
{
|
||||
m_pMain->InsertLog(log);
|
||||
}
|
||||
else
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
30
CrawlerList/syoutubetagmanage.h
Normal file
30
CrawlerList/syoutubetagmanage.h
Normal file
@@ -0,0 +1,30 @@
|
||||
#ifndef STYOUTUBETAGMANAGE_H
|
||||
#define STYOUTUBETAGMANAGE_H
|
||||
#include "smanage.h"
|
||||
|
||||
class SYoutubeTagManage : public SManage
|
||||
{
|
||||
Q_OBJECT
|
||||
public:
|
||||
enum E_PROCESS_STATE
|
||||
{
|
||||
E_PROCESS_RUN = 0,
|
||||
E_PROCESS_FINISH_WAIT,
|
||||
};
|
||||
SYoutubeTagManage(QObject *pObject);
|
||||
private:
|
||||
QString makeGetListQuery(QString _str,QDate _date,int _nPage);
|
||||
private:
|
||||
QString m_strListQuery;
|
||||
QVector <QString> m_strListURL;
|
||||
protected:
|
||||
bool Update();
|
||||
void Start();
|
||||
void processFinished(QProcess *pPro,QString _strOut);
|
||||
void ReLoadList();
|
||||
private slots:
|
||||
void readStandardOutput();
|
||||
void readStandardError();
|
||||
};
|
||||
#endif // STYOUTUBETAGMANAGE_H
|
||||
|
||||
89
CrawlerList/syoutubeusermanage.cpp
Normal file
89
CrawlerList/syoutubeusermanage.cpp
Normal file
@@ -0,0 +1,89 @@
|
||||
#include "syoutubeusermanage.h"
|
||||
#include <QThread>
|
||||
#include "widget.h"
|
||||
SYoutubeUserManage::SYoutubeUserManage(QObject *pObject) : SManage(pObject)
|
||||
{
|
||||
m_nID = 0;
|
||||
connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput()));
|
||||
connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError()));
|
||||
}
|
||||
|
||||
|
||||
void SYoutubeUserManage::Start()
|
||||
{
|
||||
m_nMode = E_PROCESS_RUN;
|
||||
m_bFinalLast = false;
|
||||
}
|
||||
|
||||
bool SYoutubeUserManage::Update()
|
||||
{
|
||||
if(m_bFinalLast) return m_bFinalLast;
|
||||
switch(m_nMode)
|
||||
{
|
||||
case E_PROCESS_RUN:
|
||||
if(UseProcess() == false)
|
||||
{
|
||||
#if defined(Q_OS_WIN32)
|
||||
m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
|
||||
#else
|
||||
m_pro[0].start("/usr/bin/python3", QStringList() << "webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
|
||||
#endif
|
||||
m_nMode = E_PROCESS_FINISH_WAIT;
|
||||
}
|
||||
break;
|
||||
|
||||
case E_PROCESS_FINISH_WAIT:
|
||||
break;
|
||||
}
|
||||
|
||||
return m_bFinalLast;
|
||||
}
|
||||
|
||||
void SYoutubeUserManage::processFinished(QProcess *pPro, QString _strOut)
|
||||
{
|
||||
switch(m_nMode)
|
||||
{
|
||||
case E_PROCESS_FINISH_WAIT:
|
||||
m_nMode = E_PROCESS_RUN;
|
||||
m_bFinalLast = true;
|
||||
m_pMain->InsertLog("Finish Crawling :)");
|
||||
m_pMain->SetCrawlingState("Finish");
|
||||
m_ncList=1;
|
||||
m_bLast = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void SYoutubeUserManage::readStandardOutput()
|
||||
{
|
||||
QProcess *pPro = (QProcess*)sender();
|
||||
QThread::msleep(100);
|
||||
QString str = pPro->readAllStandardOutput();
|
||||
QStringList list = str.split("\n", QString::SkipEmptyParts);
|
||||
foreach(QString log,list)
|
||||
{
|
||||
if (m_pMain)
|
||||
{
|
||||
m_pMain->InsertLog(log);
|
||||
}
|
||||
else
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
void SYoutubeUserManage::readStandardError()
|
||||
{
|
||||
QProcess *pPro = (QProcess*)sender();
|
||||
QThread::msleep(100);
|
||||
QString str = pPro->readAllStandardError();
|
||||
QStringList list = str.split("\n", QString::SkipEmptyParts);
|
||||
foreach(QString log,list)
|
||||
{
|
||||
if (m_pMain)
|
||||
{
|
||||
m_pMain->InsertLog(log);
|
||||
}
|
||||
else
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
30
CrawlerList/syoutubeusermanage.h
Normal file
30
CrawlerList/syoutubeusermanage.h
Normal file
@@ -0,0 +1,30 @@
|
||||
#ifndef YOUTUBE_USER_MANAGE_H
|
||||
#define YOUTUBE_USER_MANAGE_H
|
||||
#include "smanage.h"
|
||||
|
||||
class SYoutubeUserManage : public SManage
|
||||
{
|
||||
Q_OBJECT
|
||||
public:
|
||||
enum E_PROCESS_STATE
|
||||
{
|
||||
E_PROCESS_RUN = 0,
|
||||
E_PROCESS_FINISH_WAIT,
|
||||
};
|
||||
SYoutubeUserManage(QObject *pObject);
|
||||
private:
|
||||
QString makeGetListQuery(QString _str,QDate _date,int _nPage);
|
||||
private:
|
||||
QString m_strListQuery;
|
||||
QVector <QString> m_strListURL;
|
||||
protected:
|
||||
bool Update();
|
||||
void Start();
|
||||
void processFinished(QProcess *pPro,QString _strOut);
|
||||
void ReLoadList();
|
||||
private slots:
|
||||
void readStandardOutput();
|
||||
void readStandardError();
|
||||
};
|
||||
#endif // YOUTUBE_USER_MANAGE_H
|
||||
|
||||
@@ -22,6 +22,10 @@
|
||||
#include "sfacebooktagmanage.h"
|
||||
#include "sfacebookusermanage.h"
|
||||
#include "snaverblogaccuracymanage.h"
|
||||
#include "stwittertagmanage.h"
|
||||
#include "stwitterusermanage.h"
|
||||
#include "syoutubetagmanage.h"
|
||||
#include "syoutubeusermanage.h"
|
||||
#include <QApplication>
|
||||
#include <QLabel>
|
||||
|
||||
@@ -51,6 +55,10 @@ Widget::Widget(QWidget *parent) : QWidget(parent) , m_nMode(E_MODE_WAIT)
|
||||
m_pFacebookTag = new SFacebookTagManage(this);
|
||||
m_pFacebookUser = new SFacebookUserManage(this);
|
||||
m_pNaverBlogAccuracy = new SNaverBlogAccuracyManage(this);
|
||||
m_pTwitterTag = new STwitterTagManage(this);
|
||||
m_pTwitterUser = new STwitterUserManage(this);
|
||||
m_pYoutubeTag = new SYoutubeTagManage(this);
|
||||
m_pYoutubeUser = new SYoutubeUserManage(this);
|
||||
|
||||
m_pManage[0] = m_pNaverCafe;
|
||||
m_pManage[1] = m_pNaverBlog;
|
||||
@@ -66,6 +74,10 @@ Widget::Widget(QWidget *parent) : QWidget(parent) , m_nMode(E_MODE_WAIT)
|
||||
m_pManage[11] = m_pFacebookTag;
|
||||
m_pManage[12] = m_pFacebookUser;
|
||||
m_pManage[13] = m_pNaverBlogAccuracy;
|
||||
m_pManage[14] = m_pTwitterTag;
|
||||
m_pManage[15] = m_pTwitterUser;
|
||||
m_pManage[16] = m_pYoutubeTag;
|
||||
m_pManage[17] = m_pYoutubeUser;
|
||||
|
||||
m_db = QSqlDatabase::addDatabase("QMYSQL");
|
||||
m_db.setHostName("bigbird.iptime.org");
|
||||
|
||||
@@ -26,6 +26,10 @@ class SInstaUserManage;
|
||||
class SFacebookTagManage;
|
||||
class SFacebookUserManage;
|
||||
class SNaverBlogAccuracyManage;
|
||||
class STwitterTagManage;
|
||||
class STwitterUserManage;
|
||||
class SYoutubeTagManage;
|
||||
class SYoutubeUserManage;
|
||||
|
||||
#define SAFE_DELETE(p) {if(p) delete (p); (p) = NULL; }
|
||||
|
||||
@@ -55,7 +59,7 @@ private:
|
||||
QLineEdit *m_pedStartDay;
|
||||
QTimer m_timer,m_timerAlive;
|
||||
QSqlDatabase m_db;
|
||||
static const int C_PLATFORM_MAX = 14;
|
||||
static const int C_PLATFORM_MAX = 18;
|
||||
SManage *m_pManage[C_PLATFORM_MAX];
|
||||
QListWidget *m_pResultList;
|
||||
QString m_strFileName;
|
||||
@@ -75,6 +79,11 @@ private:
|
||||
SFacebookTagManage *m_pFacebookTag;
|
||||
SFacebookUserManage *m_pFacebookUser;
|
||||
SNaverBlogAccuracyManage *m_pNaverBlogAccuracy;
|
||||
STwitterTagManage* m_pTwitterTag;
|
||||
STwitterUserManage* m_pTwitterUser;
|
||||
SYoutubeTagManage* m_pYoutubeTag;
|
||||
SYoutubeUserManage* m_pYoutubeUser;
|
||||
|
||||
int m_nStartTime,m_nRangeTime,m_nPlatform;
|
||||
//QGroupBox *m_pgbManual;
|
||||
QCheckBox *m_pcheckboxReal;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#include "scrawler.h"
|
||||
#include "scrawler.h"
|
||||
|
||||
#include <QCoreApplication>
|
||||
#include <iostream>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#include "scrawler.h"
|
||||
#include "scrawler.h"
|
||||
#include <iostream>
|
||||
#include <QSqlQuery>
|
||||
#include <QSqlError>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#ifndef SCRAWLER_H
|
||||
#ifndef SCRAWLER_H
|
||||
#define SCRAWLER_H
|
||||
|
||||
#include <QtWebKitWidgets>
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#ifndef SCRAWLERDATA
|
||||
#ifndef SCRAWLERDATA
|
||||
#define SCRAWLERDATA
|
||||
|
||||
#endif // SCRAWLERDATA
|
||||
|
||||
@@ -85,6 +85,10 @@ Widget::Widget(QWidget *parent)
|
||||
"WHEN 11 THEN 'Facebook Tag' "
|
||||
"WHEN 12 THEN 'Facebook User' "
|
||||
"WHEN 13 THEN 'Naver Blog Accuracy' "
|
||||
"WHEN 14 THEN 'Twitter Tag' "
|
||||
"WHEN 15 THEN 'Twitter User' "
|
||||
"WHEN 16 THEN 'Youtube Tag' "
|
||||
"WHEN 17 THEN 'Youtube User' "
|
||||
"ELSE 'UnKnown'"
|
||||
"END AS platform FROM keyword where state is null");
|
||||
m_pmodelGroup->setQuery("SELECT * FROM datagroup");
|
||||
@@ -140,7 +144,7 @@ QGroupBox *Widget::setKeywordWidgets()
|
||||
m_pcbPlatform = new QComboBox;
|
||||
m_pcbPlatform->addItems(QStringList() << "Naver Cafe" << "Naver Blog" << "Daum Cafe" << "Naver News" << "Naver Cafe List" << "Daum Cafe List"
|
||||
<< "Kakao Story Channel" << "Kakao Story Tag" << "Kakao Story User" << "Instagram Tag" << "Instagram User"
|
||||
<< "Facebook Tag" << "Facebook User" << "Naver Blog Accuracy");
|
||||
<< "Facebook Tag" << "Facebook User" << "Naver Blog Accuracy" << "Twitter Tag" << "Twitter User" << "Youtube Tag" << "Youtube User");
|
||||
|
||||
m_pleKeyword = new QLineEdit;
|
||||
m_pleAuthorship = new QLineEdit;
|
||||
@@ -380,6 +384,10 @@ void Widget::on_keyword_currentRowChanged(QModelIndex _index)
|
||||
if (str == QString("Facebook Tag")) nSelect = 11;
|
||||
if (str == QString("Facebook User")) nSelect = 12;
|
||||
if (str == QString("Naver Blog Accuracy")) nSelect = 13;
|
||||
if (str == QString("Twitter Tag")) nSelect = 14;
|
||||
if (str == QString("Twitter User")) nSelect = 15;
|
||||
if (str == QString("Youtube Tag")) nSelect = 16;
|
||||
if (str == QString("Youtube User")) nSelect = 17;
|
||||
m_pcbPlatform->setCurrentIndex(nSelect);
|
||||
}
|
||||
}
|
||||
@@ -504,6 +512,10 @@ void Widget::on_keyword_button_insert()
|
||||
"WHEN 11 THEN 'Facebook Tag' "
|
||||
"WHEN 12 THEN 'Facebook User' "
|
||||
"WHEN 13 THEN 'Naver Blog Accuracy' "
|
||||
"WHEN 14 THEN 'Twitter Tag' "
|
||||
"WHEN 15 THEN 'Twitter User' "
|
||||
"WHEN 16 THEN 'Youtube Tag' "
|
||||
"WHEN 17 THEN 'Youtube User' "
|
||||
"ELSE 'UnKnown'"
|
||||
"END AS platform FROM keyword where state is null");
|
||||
}
|
||||
@@ -535,6 +547,10 @@ void Widget::on_keyword_button_delete()
|
||||
"WHEN 11 THEN 'Facebook Tag' "
|
||||
"WHEN 12 THEN 'Facebook User' "
|
||||
"WHEN 13 THEN 'Naver Blog Accuracy' "
|
||||
"WHEN 14 THEN 'Twitter Tag' "
|
||||
"WHEN 15 THEN 'Twitter User' "
|
||||
"WHEN 16 THEN 'Youtube Tag' "
|
||||
"WHEN 17 THEN 'Youtube User' "
|
||||
"ELSE 'UnKnown'"
|
||||
"END AS platform FROM keyword where state is null");
|
||||
}
|
||||
@@ -576,6 +592,10 @@ void Widget::on_keyword_button_modify()
|
||||
"WHEN 11 THEN 'Facebook Tag' "
|
||||
"WHEN 12 THEN 'Facebook User' "
|
||||
"WHEN 13 THEN 'Naver Blog Accuracy' "
|
||||
"WHEN 14 THEN 'Twitter Tag' "
|
||||
"WHEN 15 THEN 'Twitter User' "
|
||||
"WHEN 16 THEN 'Youtube Tag' "
|
||||
"WHEN 17 THEN 'Youtube User' "
|
||||
"ELSE 'UnKnown'"
|
||||
"END AS platform FROM keyword where state is null");
|
||||
}
|
||||
@@ -1100,7 +1120,14 @@ void Widget::on_group_button_copy_start()
|
||||
void Widget::UpdateCrawling()
|
||||
{
|
||||
m_pmodelCrawling->setQuery("SELECT _crawling.id,_keyword.realtime,_keyword.searches,_keyword.start,_keyword.end, _datagroup.name , "
|
||||
"(CASE _keyword.platform WHEN 0 THEN 'Naver Cafe' WHEN 1 THEN 'Naver Blog' WHEN 2 THEN 'Daum Cafe' WHEN 3 THEN 'Naver News' WHEN 4 THEN 'Naver Cafe List' WHEN 5 THEN 'Daum Cafe List' WHEN 6 THEN 'Kakao Story Channel' "
|
||||
"(CASE _keyword.platform "
|
||||
"WHEN 0 THEN 'Naver Cafe' "
|
||||
"WHEN 1 THEN 'Naver Blog' "
|
||||
"WHEN 2 THEN 'Daum Cafe' "
|
||||
"WHEN 3 THEN 'Naver News' "
|
||||
"WHEN 4 THEN 'Naver Cafe List' "
|
||||
"WHEN 5 THEN 'Daum Cafe List' "
|
||||
"WHEN 6 THEN 'Kakao Story Channel' "
|
||||
"WHEN 7 THEN 'Kakao Story Tag' "
|
||||
"WHEN 8 THEN 'Kakao Story User' "
|
||||
"WHEN 9 THEN 'Instagram Tag' "
|
||||
@@ -1108,6 +1135,10 @@ void Widget::UpdateCrawling()
|
||||
"WHEN 11 THEN 'Facebook Tag' "
|
||||
"WHEN 12 THEN 'Facebook User' "
|
||||
"WHEN 13 THEN 'Naver Blog Accuracy' "
|
||||
"WHEN 14 THEN 'Twitter Tag' "
|
||||
"WHEN 15 THEN 'Twitter User' "
|
||||
"WHEN 16 THEN 'Youtube Tag' "
|
||||
"WHEN 17 THEN 'Youtube User' "
|
||||
"ELSE 'UnKnown' END ) AS platform , "
|
||||
"(CASE _crawling.state WHEN 0 THEN 'Waiting' WHEN 1 THEN 'Running' WHEN 2 THEN 'Terminated' ELSE 'None' END ) AS state "
|
||||
"FROM crawling _crawling INNER JOIN keyword _keyword ON _crawling.keyword_id = _keyword.id "
|
||||
|
||||
@@ -32,6 +32,7 @@ def is_debugger_attached():
|
||||
|
||||
is_debug = is_debugger_attached()
|
||||
|
||||
|
||||
def printl(*objects, sep=' ', end='\n', file=None, flush=True):
|
||||
if is_debug:
|
||||
cur_frame = inspect.currentframe()
|
||||
|
||||
79
WebBasedCrawler/base/dbdata.py
Normal file
79
WebBasedCrawler/base/dbdata.py
Normal file
@@ -0,0 +1,79 @@
|
||||
from pymysql.connections import Connection
|
||||
import datetime
|
||||
from numbers import Number
|
||||
|
||||
class DataDBRow:
|
||||
def __init__(self):
|
||||
self.platform_name = None
|
||||
self.platform_form = None
|
||||
self.platform_title = None
|
||||
self.article_form = None
|
||||
self.article_parent = None
|
||||
self.article_id = None
|
||||
self.article_nickname = None
|
||||
self.article_title = None
|
||||
self.article_data = None
|
||||
self.article_url = None
|
||||
self.article_hit = 0
|
||||
self.article_date = None
|
||||
self.article_order = 0
|
||||
self.article_profile = None
|
||||
self.article_profileurl = None
|
||||
self.platform_id = None
|
||||
self.keyword_id = -1
|
||||
self.reply_url = None
|
||||
self.etc = None
|
||||
|
||||
def get_keys(self):
|
||||
inst = DataDBRow()
|
||||
keys = ()
|
||||
for key, value_type in inst.__dict__.items():
|
||||
if key.startswith('__') or callable(value_type):
|
||||
continue
|
||||
|
||||
keys += key,
|
||||
|
||||
return keys
|
||||
|
||||
def get_values(self, conn, db_num):
|
||||
inst = DataDBRow()
|
||||
values = ()
|
||||
for key, value_type in inst.__dict__.items():
|
||||
if key.startswith('__') or callable(value_type):
|
||||
continue
|
||||
|
||||
value = self.__dict__[key]
|
||||
if isinstance(value, Number):
|
||||
values += str(value),
|
||||
elif isinstance(value, str):
|
||||
values += conn.escape(value.encode('utf8').decode('utf8')),
|
||||
else:
|
||||
values += conn.escape(value),
|
||||
|
||||
return values
|
||||
|
||||
def get_insert_query(self, conn, db_num):
|
||||
|
||||
inst = DataDBRow()
|
||||
|
||||
keys = ''
|
||||
values = ''
|
||||
for key, value_type in inst.__dict__.items():
|
||||
if key.startswith('__') or callable(value_type):
|
||||
continue
|
||||
|
||||
if len(keys) > 0:
|
||||
keys += ', '
|
||||
values += ', '
|
||||
|
||||
keys += key
|
||||
value = self.__dict__[key]
|
||||
if isinstance(value, Number):
|
||||
values += str(value)
|
||||
elif isinstance(value, str):
|
||||
values += conn.escape(value.encode('utf8').decode('utf8'))
|
||||
else:
|
||||
values += conn.escape(value)
|
||||
|
||||
query = 'insert into data_{} ({}) values ({})'.format(db_num, keys, values)
|
||||
return query
|
||||
@@ -97,6 +97,31 @@ def get_driver(platform, proxies):
|
||||
else:
|
||||
return platform_webdriver[platform](capabilities=desired_capabilities)
|
||||
|
||||
_expired_proxies = []
|
||||
|
||||
|
||||
def set_proxy_expired(proxy):
|
||||
if proxy not in _expired_proxies:
|
||||
_expired_proxies.append(proxy)
|
||||
|
||||
address = proxy['http'][len('http://'):]
|
||||
|
||||
with open(proxy_filename, 'r') as f:
|
||||
lines = f.readlines()
|
||||
|
||||
expired_idx = -1
|
||||
for idx, line in enumerate(lines):
|
||||
if line.startswith(address):
|
||||
expired_idx = idx
|
||||
break
|
||||
|
||||
if expired_idx >= 0:
|
||||
lines[expired_idx] = '# ' + lines[expired_idx]
|
||||
lines.append(lines.pop(expired_idx))
|
||||
|
||||
with open(proxy_filename, 'w') as f:
|
||||
f.writelines(lines)
|
||||
|
||||
|
||||
def get_proxy_from_file(filename):
|
||||
"""
|
||||
@@ -104,7 +129,7 @@ def get_proxy_from_file(filename):
|
||||
:return (ip, port): string, string
|
||||
if ip, port or filename is invalid, return (None, None)
|
||||
"""
|
||||
proxy_lists = [line.replace('\n', '') for line in open(filename) if re_ip.search(line)]
|
||||
proxy_lists = [line.replace('\n', '') for line in open(filename) if not line.strip().startswith('#') and re_ip.search(line)]
|
||||
if proxy_lists:
|
||||
m = re_ip.search(proxy_lists[random.randint(0, len(proxy_lists) - 1)])
|
||||
if m:
|
||||
|
||||
3
WebBasedCrawler/requirements.txt
Normal file
3
WebBasedCrawler/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
requests
|
||||
bs4
|
||||
pytz
|
||||
0
WebBasedCrawler/twitter/__init__.py
Normal file
0
WebBasedCrawler/twitter/__init__.py
Normal file
62
WebBasedCrawler/twitter/twconfig.py
Normal file
62
WebBasedCrawler/twitter/twconfig.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import datetime
|
||||
import copy
|
||||
|
||||
class TwitterConfig:
|
||||
protocol = 'https'
|
||||
top_url = 'twitter.com'
|
||||
search_url = '/i/search/timeline'
|
||||
conversation_url_form = '/i/{}/conversation/{}'
|
||||
|
||||
def __init__(self):
|
||||
self.keyword_id = -1
|
||||
self.db_num = -1
|
||||
|
||||
self.id = 0
|
||||
self.realtime = False
|
||||
self.keywords = []
|
||||
self.start_str = None
|
||||
self.start = None
|
||||
self.end_str = None
|
||||
self.end = None
|
||||
self.authorship = None
|
||||
self.state = None
|
||||
self.platform = None
|
||||
|
||||
def set_param(self, keyword_id, db_num, params):
|
||||
self.keyword_id = int(keyword_id)
|
||||
self.db_num = int(db_num)
|
||||
|
||||
self.id = int(params['id'])
|
||||
self.realtime = params['realtime'] == '1'
|
||||
|
||||
self.keywords = []
|
||||
for keyword in params['searches'].split(','):
|
||||
self.keywords.append(keyword.strip())
|
||||
|
||||
self.start_str = str(params['start'])
|
||||
self.end_str = str(params['end'])
|
||||
|
||||
self.start = datetime.datetime.combine(params['start'], datetime.datetime.min.time())
|
||||
self.end = datetime.datetime.combine(params['end'], datetime.datetime.min.time())
|
||||
|
||||
self.authorship = params['authorship']
|
||||
self.state = params['state']
|
||||
self.platform = params['platform']
|
||||
|
||||
def split(self):
|
||||
split_list = []
|
||||
new_end = self.end
|
||||
|
||||
while new_end > self.start:
|
||||
new_config = copy.deepcopy(self)
|
||||
|
||||
new_config.end = new_end
|
||||
new_end = new_end + datetime.timedelta(days=-1)
|
||||
new_config.start = new_end
|
||||
|
||||
new_config.start_str = new_config.start.strftime('%Y-%m-%d')
|
||||
new_config.end_str = new_config.end.strftime('%Y-%m-%d')
|
||||
|
||||
split_list.append(new_config)
|
||||
|
||||
return split_list
|
||||
79
WebBasedCrawler/twitter/twdbhelper.py
Normal file
79
WebBasedCrawler/twitter/twdbhelper.py
Normal file
@@ -0,0 +1,79 @@
|
||||
from twitter.tweet import Tweet
|
||||
import multiprocessing as mp
|
||||
|
||||
|
||||
class TwitterDBHelper:
|
||||
pymysql = __import__('pymysql.cursors')
|
||||
|
||||
def __init__(self):
|
||||
self.tweets = []
|
||||
self.buffer = []
|
||||
self.lock = mp.Lock()
|
||||
pass
|
||||
|
||||
def __del__(self):
|
||||
pass
|
||||
|
||||
def get_param(self, keyword_id):
|
||||
query = "select * from keyword where id = " + str(keyword_id)
|
||||
params = []
|
||||
try:
|
||||
conn = self.pymysql.connect(host='bigbird.iptime.org',
|
||||
user='admin', passwd='admin123',
|
||||
db='concepters', charset='utf8',
|
||||
cursorclass=self.pymysql.cursors.DictCursor)
|
||||
|
||||
with conn.cursor() as cursor:
|
||||
cursor.execute(query)
|
||||
params = cursor.fetchone()
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
exit(1)
|
||||
|
||||
else:
|
||||
conn.close()
|
||||
|
||||
return params
|
||||
|
||||
def insert_tweet(self, tweet: Tweet = None, db_num: int = -1, flush=False):
|
||||
|
||||
# self.lock.acquire()
|
||||
# if tweet is not None:
|
||||
# self.buffer.append((tweet, db_num, ))
|
||||
#
|
||||
# local_buffer = None
|
||||
# if len(self.buffer) >= 100 or flush:
|
||||
# local_buffer = copy.deepcopy(self.buffer)
|
||||
# self.buffer.clear()
|
||||
# self.lock.release()
|
||||
|
||||
local_buffer = [(tweet, db_num, )]
|
||||
if local_buffer:
|
||||
while True:
|
||||
try:
|
||||
conn = self.pymysql.connect(host='bigbird.iptime.org',
|
||||
user='admin', passwd='admin123',
|
||||
db='concepters', charset='utf8',
|
||||
cursorclass=self.pymysql.cursors.DictCursor,
|
||||
connect_timeout=5)
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
continue
|
||||
|
||||
else:
|
||||
break
|
||||
|
||||
try:
|
||||
with conn.cursor() as cursor:
|
||||
for tweet, _db_num in local_buffer:
|
||||
query = tweet.get_insert_query(conn, _db_num)
|
||||
cursor.execute(query)
|
||||
conn.commit()
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
finally:
|
||||
conn.close()
|
||||
24
WebBasedCrawler/twitter/tweet.py
Normal file
24
WebBasedCrawler/twitter/tweet.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from base.dbdata import DataDBRow
|
||||
|
||||
|
||||
class Tweet(DataDBRow):
|
||||
|
||||
def __init__(self):
|
||||
super(self.__class__, self).__init__()
|
||||
|
||||
self.tweet_id = None
|
||||
self.user_id = None
|
||||
self.user_name = None
|
||||
self.text = None
|
||||
self.created_at = None
|
||||
self.retweets = 0
|
||||
self.favorites = 0
|
||||
|
||||
self.is_reply = False
|
||||
self.reply_cnt = 0
|
||||
self.retweet_cnt = 0
|
||||
self.favorite_cnt = 0
|
||||
self.top_link = None
|
||||
self.tweet_link = None
|
||||
|
||||
self.depth = 0
|
||||
289
WebBasedCrawler/twitter/twittercrawl.py
Normal file
289
WebBasedCrawler/twitter/twittercrawl.py
Normal file
@@ -0,0 +1,289 @@
|
||||
from twitter.twconfig import TwitterConfig
|
||||
from twitter.twdbhelper import TwitterDBHelper
|
||||
from twitter.tweet import Tweet
|
||||
from twitter.twparser import TweetParser
|
||||
|
||||
import base.proxy
|
||||
import base.baseclasses
|
||||
|
||||
import requests
|
||||
import bs4
|
||||
import json
|
||||
import urllib
|
||||
import threading
|
||||
import queue
|
||||
import time
|
||||
|
||||
|
||||
class TwitterCrawler():
    """Crawls Twitter's web search timeline and conversation pages.

    Runner threads page through date-sliced search results and push
    tweets that have replies into a shared queue; content threads pull
    from that queue, fetch each conversation page and recurse into
    nested replies.  All rows are written through TwitterDBHelper.
    """

    def __init__(self):
        self.default_config = TwitterConfig()
        self.db_helper = TwitterDBHelper()

    def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
        """Load keyword parameters from the DB into the default config.

        NOTE(review): browser/before_day/until_page are accepted only
        for interface parity with the other platform crawlers and are
        unused here — confirm whether date limiting should apply.
        """
        params = self.db_helper.get_param(keyword_id)
        self.default_config.set_param(keyword_id, db_num, params)

    @staticmethod
    def get_timeline_url(query, start_str, end_str, max_position=''):
        """Build the JSON search-timeline URL for one date window."""
        params = {
            'f': 'tweets',
            'vertical': 'default',
            'src': 'typd',
            'q': '{} since:{} until:{}'.format(query, start_str, end_str),
            'language': 'en',
            'max_position': max_position,
        }

        url_tupple = (TwitterConfig.protocol, TwitterConfig.top_url, TwitterConfig.search_url, '', urllib.parse.urlencode(params), '')
        return urllib.parse.urlunparse(url_tupple)

    @staticmethod
    def get_content_url(user_id, tweet_id, max_position=''):
        """Build the conversation (reply thread) URL for one tweet."""
        params = {
            'max_position': max_position,
        }

        sub_url = TwitterConfig.conversation_url_form.format(user_id, tweet_id)
        url_tupple = (TwitterConfig.protocol, TwitterConfig.top_url, sub_url, '', urllib.parse.urlencode(params), '')
        return urllib.parse.urlunparse(url_tupple)

    @staticmethod
    def get_page(url, proc_id):
        """GET `url` through rotating proxies; returns the response or
        None once the proxy pool is exhausted ((None, None) sentinel)."""
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
            'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
        }
        proxies = base.proxy.get_proxy_for_requests()

        resp = None
        while True:
            try:
                resp = requests.get(url, headers=headers, proxies=proxies, timeout=3)
            except Exception as e:
                if proxies == (None, None):
                    break

                # Mark the failed proxy expired and rotate to the next one.
                print('[{}] proxy {} is expired. ({})'.format(proc_id, proxies, e))
                base.proxy.set_proxy_expired(proxies)
                proxies = base.proxy.get_proxy_for_requests()
            else:
                break

        return resp

    def runner_proc(self, proc_id, content_queue, result_queue, config):
        """Timeline worker: pages through one date window of search
        results, storing top-level tweets and queueing those with
        replies for the content workers.

        Puts (proc_id, tweet_count) on `result_queue` when done.
        """
        print('{} to {} runner thread start'.format(config.start_str, config.end_str))

        b_continue = True
        min_tweet_id = None
        max_tweet_id = None
        max_position = ''
        tweet_count = 0

        while b_continue:
            # Twitter's paging cursor is 'TWEET-<oldest>-<newest>'.
            if min_tweet_id is not None:
                max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id)
            url = self.get_timeline_url(config.keywords[0], config.start_str, config.end_str, max_position)
            resp = self.get_page(url, proc_id)
            if resp is None:
                break

            j = json.loads(resp.content.decode('utf-8'))
            soup = bs4.BeautifulSoup(j['items_html'], 'lxml')
            tweet_tags = soup.select("div.tweet")

            for tw in tweet_tags:
                tweet = TweetParser.parse(tw, config.keyword_id)

                # Replies are picked up via the conversation pages, not
                # from the search timeline.
                if tweet.is_reply is True:
                    continue

                if tweet.reply_cnt > 0:
                    self.insert_content_pool(proc_id, content_queue, tweet, tweet)

                self.db_helper.insert_tweet(tweet, config.db_num)

            count = len(tweet_tags)
            if count == 0:
                break

            # min_tweet_id is pinned to the very first item; only the
            # trailing cursor advances each page.
            if min_tweet_id is None:
                min_tweet_id = tweet_tags[0].attrs['data-item-id']
            max_tweet_id = tweet_tags[-1].attrs['data-item-id']
            tweet_count += count

        print('{} to {} runner thread finished {}'.format(config.start_str, config.end_str, tweet_count))
        result_queue.put((proc_id, tweet_count, ))
        return proc_id, tweet_count,

    @staticmethod
    def insert_content_pool(proc_id: int, qu, tweet: Tweet, tweet_top: Tweet):
        """Queue (tweet, conversation-root) for the content workers."""
        qu.put((tweet, tweet_top,))

    @staticmethod
    def get_content(content_queue):
        """Blocking pop with a 60-second idle timeout; returns
        (None, None) when the queue has been quiet for a minute."""
        sleep_time = time.time()
        while True:
            try:
                parent_tw, top_tw, = content_queue.get(block=True, timeout=2)
            except Exception as e:
                if time.time()-sleep_time > 60:
                    break
                else:
                    continue
            else:
                return parent_tw, top_tw,

        return None, None,

    def content_proc(self, proc_id, content_queue, result_queue):
        """Conversation worker: fetches the reply thread of each queued
        tweet, stores the replies and re-queues any that themselves
        have nested replies."""
        print('[{}] content thread start'.format(proc_id))

        tweet_count = 0
        while True:
            parent_tw, top_tw, = self.get_content(content_queue)
            if not parent_tw:
                break

            max_position = ''

            b_continue = True
            while b_continue:
                url = self.get_content_url(parent_tw.user_id, parent_tw.tweet_id, max_position)
                resp = self.get_page(url, proc_id)
                if resp is None or resp.status_code == 404:
                    break
                elif resp.status_code != 200:
                    print('[WARNING] content_get code {}'.format(resp.status_code))
                    continue

                j = json.loads(resp.content.decode('utf-8'))
                soup = bs4.BeautifulSoup(j['items_html'], 'lxml')

                reply_container_tags = soup.select('li.ThreadedConversation')
                reply_container_tags += TweetParser.get_lone_container(soup, parent_tw)
                for container_tags in reply_container_tags:
                    tweet_tags = container_tags.select('div.tweet')
                    if len(tweet_tags) > 0:
                        tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw)
                        # Nested replies go back on the queue for this
                        # or another content worker.
                        self.insert_content_pool(proc_id, content_queue, tweet, top_tw)
                        self.db_helper.insert_tweet(tweet, self.default_config.db_num)
                        tweet_count += 1

                b_continue = j['has_more_items']
                if b_continue:
                    max_position = j['min_position']

        result_queue.put((proc_id, tweet_count))
        print('[{}] content thread finished'.format(proc_id))
        return proc_id, tweet_count,

    def debug_content(self):
        """Manual test: run only the content workers on one seed tweet."""
        content_qu = queue.Queue()
        runner_result_qu = queue.Queue()
        content_result_qu = queue.Queue()

        test_tw = Tweet()
        test_tw.tweet_link = 'https://twitter.com/Awesome_vely/status/888704413111435264'
        test_tw.user_id = 'Awesome_vely'
        test_tw.tweet_id = 888704413111435264

        test_tw.text = '시작'
        self.insert_content_pool(0, content_qu, test_tw, test_tw)

        content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]
        [th.start() for th in content_threads]
        [th.join() for th in content_threads]

        while not content_result_qu.empty():
            res = content_result_qu.get()
            print('reply : {}'.format(res))

        print('end all')

    def test_insert_db(self):
        """Manual test: insert the same tweet row several times."""
        test_tw = Tweet()
        test_tw.tweet_link = 'https://twitter.com/moonriver365/status/885797401033818112'
        test_tw.user_id = 'moonriver365'
        test_tw.tweet_id = 885797401033818112
        for _ in range(5):
            self.db_helper.insert_tweet(test_tw, self.default_config.db_num)

    def debug(self):
        """Debug entry point; only active when base.baseclasses.is_debug."""
        if base.baseclasses.is_debug:
            ## contents check
            self.debug_content()

            print("debug end")

    def start(self):
        """Run the full crawl: date-split runner threads plus 16 content
        threads, then re-run any date window that yielded zero tweets."""
        start_time = time.time()

        # run
        split_config = self.default_config.split()
        content_qu = queue.Queue()
        runner_result_qu = queue.Queue()
        content_result_qu = queue.Queue()

        runner_threads = [threading.Thread(target=self.runner_proc, args=(proc_id, content_qu, runner_result_qu, config)) for proc_id, config in enumerate(split_config)]
        content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]

        [th.start() for th in runner_threads]
        [th.start() for th in content_threads]

        [th.join() for th in runner_threads]
        [th.join() for th in content_threads]

        # rerun zero runners
        runner_threads = []
        runner_result_qu2 = queue.Queue()
        while not runner_result_qu.empty():
            # BUG FIX: the queue holds (proc_id, tweet_count) tuples, so
            # the old `res == 0` test never matched, and a running index
            # wrongly assumed completion order == submission order.  Use
            # the proc_id embedded in each result instead.
            res_proc_id, res_count = runner_result_qu.get()
            if res_count == 0:
                th = threading.Thread(target=self.runner_proc, args=(res_proc_id, content_qu, runner_result_qu2, split_config[res_proc_id]))
                runner_threads.append(th)

        content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)]

        [th.start() for th in runner_threads]
        [th.start() for th in content_threads]

        [th.join() for th in runner_threads]
        [th.join() for th in content_threads]

        # print running time
        delta = time.time() - start_time
        m, s = divmod(delta, 60)
        h, m = divmod(m, 60)
        print("finished all {}:{:02d}:{:02d} ".format(int(h), int(m), int(s)))
||||
96
WebBasedCrawler/twitter/twparser.py
Normal file
96
WebBasedCrawler/twitter/twparser.py
Normal file
@@ -0,0 +1,96 @@
|
||||
from twitter.tweet import Tweet
|
||||
from twitter.twconfig import TwitterConfig
|
||||
|
||||
import bs4
|
||||
import datetime
|
||||
import pytz
|
||||
|
||||
class TweetParser:
    """Turns a tweet <div> from Twitter's web HTML into a Tweet object."""

    @staticmethod
    def parse(tag, keyword_id, depth=0, top_tw: Tweet=None):
        """Parse one `div.tweet` tag.

        :param tag: bs4 tag for the tweet container.
        :param keyword_id: crawl keyword id stored on the row.
        :param depth: reply-nesting depth (0 = top-level timeline tweet).
        :param top_tw: conversation-root tweet, or None for top-level.
        :return: populated Tweet.
        """
        tweet = Tweet()

        tweet.tweet_id = int(tag.attrs['data-tweet-id'])

        # The display name may contain child tags (emoji images etc.);
        # keep only the plain-text pieces, space-separated.
        nickname_tag = tag.select('strong.fullname')[0]
        tweet.user_name = ''
        for child in nickname_tag.children:
            if isinstance(child, bs4.element.NavigableString):
                if len(tweet.user_name) > 0:
                    tweet.user_name += ' '
                tweet.user_name += child
        tweet.user_id = tag.select('span.username')[0].text[1:]
        tweet.text = tag.select('p.tweet-text')[0].text

        # Posting time: take the raw epoch timestamp and localize to
        # Asia/Seoul (locale-independent, unlike the title-string parse
        # the commented-out variants used).
        timestamp = int(tag.select('span._timestamp')[0].attrs['data-time'])
        utc_dt = datetime.datetime.utcfromtimestamp(timestamp)
        local_tz = pytz.timezone('Asia/Seoul')
        local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
        tweet.created_at = local_tz.normalize(local_dt)

        reply_tag = tag.select('div.ReplyingToContextBelowAuthor')
        tweet.is_reply = len(reply_tag) > 0

        reply_cnt_tag = tag.select('span.ProfileTweet-action--reply > span.ProfileTweet-actionCount')
        if len(reply_cnt_tag) > 0:
            tweet.reply_cnt = int(reply_cnt_tag[0].attrs['data-tweet-stat-count'])

        retweet_cnt_tag = tag.select('span.ProfileTweet-action--retweet > span.ProfileTweet-actionCount')
        if len(retweet_cnt_tag) > 0:
            tweet.retweet_cnt = int(retweet_cnt_tag[0].attrs['data-tweet-stat-count'])

        favorite_cnt_tag = tag.select('span.ProfileTweet-action--favorite > span.ProfileTweet-actionCount')
        if len(favorite_cnt_tag) > 0:
            # BUG FIX: was `tweet.favorites_cnt` — a typo that created a
            # new attribute, so Tweet.favorite_cnt stayed 0 forever.
            tweet.favorite_cnt = int(favorite_cnt_tag[0].attrs['data-tweet-stat-count'])

        link_tag = tag.select('a.js-permalink')
        if len(link_tag) > 0:
            tweet.tweet_link = TwitterConfig.protocol + '://' + TwitterConfig.top_url + link_tag[0].attrs['href']
        tweet.top_link = top_tw.tweet_link if top_tw else tweet.tweet_link

        tweet.depth = depth

        # Map onto the generic DB-row fields shared by all platforms.
        tweet.platform_name = 'twitter'
        tweet.platform_form = 'post'
        tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id
        # BUG FIX: was `tweet.depth is 0` — identity comparison on ints
        # is an implementation detail; use equality.
        tweet.article_form = 'body' if tweet.depth == 0 else 'reply'
        tweet.article_id = tweet.user_id
        tweet.article_nickname = tweet.user_name
        tweet.article_data = tweet.text
        tweet.article_url = tweet.top_link
        tweet.article_date = tweet.created_at
        tweet.article_order = tweet.depth
        tweet.article_profileurl = TwitterConfig.protocol + '://' + TwitterConfig.top_url + '/' + tweet.user_id
        tweet.platform_id = top_tw.user_id if top_tw else tweet.user_id
        tweet.keyword_id = keyword_id
        tweet.reply_url = tweet.tweet_link

        return tweet

    @staticmethod
    def get_lone_container(soup, parent_tw):
        """Collect the 'lone tweet' containers that follow the parent
        tweet in a conversation page, preserving page order.

        Walks the containers in reverse and stops at the parent tweet
        itself, so only replies *below* the parent are returned.
        """
        lone_tweets = soup.select('div.ThreadedConversation--loneTweet')
        container_tags = []
        for tag in reversed(lone_tweets):
            li = tag.select('li.stream-item')
            if len(li) > 0 and 'data-item-id' in li[0].attrs:
                tweet_id = int(li[0].attrs['data-item-id'])
                if tweet_id == parent_tw.tweet_id:
                    break

            container_tags.append(tag)

        return reversed(container_tags)
||||
@@ -11,6 +11,8 @@ from kakao import kakaocrawl
|
||||
from naver import navercrawl
|
||||
from facebook import facebookcrawl
|
||||
from facebook import facebookcrawlbs
|
||||
from twitter import twittercrawl
|
||||
from youtube import youtubecrawl
|
||||
|
||||
from base.baseclasses import print_and_flush
|
||||
|
||||
@@ -26,8 +28,12 @@ class WebBasedCrawler:
|
||||
self.crawler = kakaocrawl.KakaoMainCrawler()
|
||||
elif platform == "navercafe":
|
||||
self.crawler = navercrawl.NaverCafeMainAreaCrawler()
|
||||
elif platform == "facebook":
|
||||
elif platform == 'facebook':
|
||||
self.crawler = facebookcrawlbs.FacebookMainCrawler()
|
||||
elif platform == 'twitter':
|
||||
self.crawler = twittercrawl.TwitterCrawler()
|
||||
elif platform == 'youtube':
|
||||
self.crawler = youtubecrawl.YoutubeMainCrawler()
|
||||
else:
|
||||
self.crawler = None
|
||||
raise Exception
|
||||
@@ -38,7 +44,7 @@ class WebBasedCrawler:
|
||||
|
||||
|
||||
browser_opt = ('chrome', "ie", "opera", "firefox")
|
||||
platform_opt = ('instagram', 'kakaostory', 'navercafe', "facebook")
|
||||
platform_opt = ('instagram', 'kakaostory', 'navercafe', 'facebook', 'twitter', 'youtube')
|
||||
|
||||
|
||||
def get_browser_info(platform_, file_name="browser.txt"):
|
||||
@@ -73,7 +79,7 @@ def get_browser_info(platform_, file_name="browser.txt"):
|
||||
if __name__ == '__main__':
|
||||
"""
|
||||
sys.argv[0] webbasedcrawler.py
|
||||
sys.argv[1] instagram, kakaochannel, navercafe, facebook
|
||||
sys.argv[1] instagram, kakaochannel, navercafe, facebook, twitter, youtube
|
||||
sys.argv[2] keyword_id
|
||||
sys.argv[3] data group
|
||||
sys.argv[4] start_day
|
||||
@@ -85,8 +91,7 @@ if __name__ == '__main__':
|
||||
else:
|
||||
print_and_flush("Check Argumenets!")
|
||||
exit(1)
|
||||
crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2],
|
||||
sys.argv[3], sys.argv[4], sys.argv[5])
|
||||
crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
|
||||
crawler.start()
|
||||
print_and_flush("Finished Crawling :)")
|
||||
exit(0)
|
||||
|
||||
0
WebBasedCrawler/youtube/__init__.py
Normal file
0
WebBasedCrawler/youtube/__init__.py
Normal file
7
WebBasedCrawler/youtube/youtubecrawl.py
Normal file
7
WebBasedCrawler/youtube/youtubecrawl.py
Normal file
@@ -0,0 +1,7 @@
|
||||
|
||||
class YoutubeMainCrawl:
    """Placeholder YouTube crawler; crawl logic is not implemented yet."""

    def __init__(self):
        pass

    def start(self):
        """No-op entry point matching the other crawlers' interface."""
        pass


# BUG FIX: webbasedcrawler.py instantiates `youtubecrawl.YoutubeMainCrawler()`
# but this module only defined `YoutubeMainCrawl`, so selecting the
# 'youtube' platform raised AttributeError.  Provide the expected name
# as a backward-compatible alias.
YoutubeMainCrawler = YoutubeMainCrawl
||||
Reference in New Issue
Block a user