Merge branch 'working/twitter'

This commit is contained in:
mjjo
2017-07-27 11:33:10 +09:00
29 changed files with 1221 additions and 15 deletions

View File

@@ -28,7 +28,11 @@ SOURCES += main.cpp\
skakaousermanage.cpp \ skakaousermanage.cpp \
sfacebooktagmanage.cpp \ sfacebooktagmanage.cpp \
sfacebookusermanage.cpp \ sfacebookusermanage.cpp \
snaverblogaccuracymanager.cpp snaverblogaccuracymanager.cpp \
stwittertagmanage.cpp \
stwitterusermanage.cpp \
syoutubetagmanage.cpp \
syoutubeusermanage.cpp
HEADERS += widget.h \ HEADERS += widget.h \
smanage.h \ smanage.h \
@@ -45,5 +49,9 @@ HEADERS += widget.h \
skakaousermanage.h \ skakaousermanage.h \
sfacebooktagmanage.h \ sfacebooktagmanage.h \
sfacebookusermanage.h \ sfacebookusermanage.h \
snaverblogaccuracymanage.h snaverblogaccuracymanage.h \
stwittertagmanage.h \
stwitterusermanage.h \
syoutubetagmanage.h \
syoutubeusermanage.h

View File

@@ -0,0 +1,89 @@
#include "stwittertagmanage.h"
#include <QThread>
#include "widget.h"
// Crawler manager for tag-based Twitter searches. Drives an external Python
// crawler process and relays its console output into the main widget's log.
STwitterTagManage::STwitterTagManage(QObject *pObject) : SManage(pObject)
{
    m_nID = 0;
    // Forward the crawler process's stdout/stderr into our logging slots.
    // m_pro[0] is presumably the QProcess slot inherited from SManage — only
    // index 0 is used by this manager.
    connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput()));
    connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError()));
}
// Arm the crawler: clear the completion flag and return to the RUN state so
// the next Update() pass launches the external process.
void STwitterTagManage::Start()
{
    m_bFinalLast = false;     // crawl not finished yet
    m_nMode = E_PROCESS_RUN;  // Update() will spawn the crawler from here
}
// One scheduler tick. Launches the external Python crawler when in the RUN
// state and no process slot is busy, then waits for processFinished().
// Returns true once the crawl has completed.
bool STwitterTagManage::Update()
{
    if (m_bFinalLast)
        return m_bFinalLast;

    if (m_nMode == E_PROCESS_RUN && UseProcess() == false)
    {
#if defined(Q_OS_WIN32)
        m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
#else
        m_pro[0].start("/usr/bin/python3", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
#endif
        m_nMode = E_PROCESS_FINISH_WAIT;
    }
    // In E_PROCESS_FINISH_WAIT there is nothing to do until the process exits.
    return m_bFinalLast;
}
// Invoked when the external crawler process exits. Only meaningful while we
// are waiting for it (E_PROCESS_FINISH_WAIT); any other state ignores it.
void STwitterTagManage::processFinished(QProcess *pPro, QString _strOut)
{
    (void)pPro;     // unused — this manager owns a single crawler process
    (void)_strOut;  // unused — output was already streamed via the read slots
    if (m_nMode != E_PROCESS_FINISH_WAIT)
        return;
    m_nMode = E_PROCESS_RUN;
    m_bFinalLast = true;
    m_pMain->InsertLog("Finish Crawling :)");
    m_pMain->SetCrawlingState("Finish");
    m_ncList = 1;
    m_bLast = false;
}
void STwitterTagManage::readStandardOutput()
{
QProcess *pPro = (QProcess*)sender();
QThread::msleep(100);
QString str = pPro->readAllStandardOutput();
QStringList list = str.split("\n", QString::SkipEmptyParts);
foreach(QString log,list)
{
if (m_pMain)
{
m_pMain->InsertLog(log);
}
else
exit(0);
}
}
void STwitterTagManage::readStandardError()
{
QProcess *pPro = (QProcess*)sender();
QThread::msleep(100);
QString str = pPro->readAllStandardError();
QStringList list = str.split("\n", QString::SkipEmptyParts);
foreach(QString log,list)
{
if (m_pMain)
{
m_pMain->InsertLog(log);
}
else
exit(0);
}
}

View File

@@ -0,0 +1,30 @@
#ifndef STWITTERTAGMANAGE_H
#define STWITTERTAGMANAGE_H
#include "smanage.h"
// Manages tag-based Twitter crawling: spawns the external Python crawler and
// relays its output to the main widget's log (see stwittertagmanage.cpp).
class STwitterTagManage : public SManage
{
    Q_OBJECT
public:
    // Lifecycle states of the external crawler process.
    enum E_PROCESS_STATE
    {
        E_PROCESS_RUN = 0,     // idle; next Update() launches the crawler
        E_PROCESS_FINISH_WAIT, // crawler running; waiting for processFinished()
    };
    // Fix: single-argument constructor marked explicit to prevent accidental
    // implicit conversion from QObject*.
    explicit STwitterTagManage(QObject *pObject);
private:
    QString makeGetListQuery(QString _str, QDate _date, int _nPage);
private:
    QString m_strListQuery;         // NOTE(review): unused in the .cpp — candidate for removal
    QVector<QString> m_strListURL;  // NOTE(review): unused in the .cpp — candidate for removal
protected:
    bool Update();
    void Start();
    void processFinished(QProcess *pPro, QString _strOut);
    void ReLoadList();
private slots:
    void readStandardOutput();  // streams crawler stdout into the log
    void readStandardError();   // streams crawler stderr into the log
};
#endif // STWITTERTAGMANAGE_H

View File

@@ -0,0 +1,89 @@
#include "stwitterusermanage.h"
#include <QThread>
#include "widget.h"
// Crawler manager for user-based Twitter searches. Drives an external Python
// crawler process and relays its console output into the main widget's log.
STwitterUserManage::STwitterUserManage(QObject *pObject) : SManage(pObject)
{
    m_nID = 0;
    // Forward the crawler process's stdout/stderr into our logging slots.
    // m_pro[0] is presumably the QProcess slot inherited from SManage — only
    // index 0 is used by this manager.
    connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput()));
    connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError()));
}
// Arm the crawler: clear the completion flag and return to the RUN state so
// the next Update() pass launches the external process.
void STwitterUserManage::Start()
{
    m_bFinalLast = false;     // crawl not finished yet
    m_nMode = E_PROCESS_RUN;  // Update() will spawn the crawler from here
}
// One scheduler tick. Launches the external Python crawler when in the RUN
// state and no process slot is busy, then waits for processFinished().
// Returns true once the crawl has completed.
bool STwitterUserManage::Update()
{
    if (m_bFinalLast)
        return m_bFinalLast;

    if (m_nMode == E_PROCESS_RUN && UseProcess() == false)
    {
#if defined(Q_OS_WIN32)
        m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
#else
        m_pro[0].start("/usr/bin/python3", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
#endif
        m_nMode = E_PROCESS_FINISH_WAIT;
    }
    // In E_PROCESS_FINISH_WAIT there is nothing to do until the process exits.
    return m_bFinalLast;
}
// Invoked when the external crawler process exits. Only meaningful while we
// are waiting for it (E_PROCESS_FINISH_WAIT); any other state ignores it.
void STwitterUserManage::processFinished(QProcess *pPro, QString _strOut)
{
    (void)pPro;     // unused — this manager owns a single crawler process
    (void)_strOut;  // unused — output was already streamed via the read slots
    if (m_nMode != E_PROCESS_FINISH_WAIT)
        return;
    m_nMode = E_PROCESS_RUN;
    m_bFinalLast = true;
    m_pMain->InsertLog("Finish Crawling :)");
    m_pMain->SetCrawlingState("Finish");
    m_ncList = 1;
    m_bLast = false;
}
void STwitterUserManage::readStandardOutput()
{
QProcess *pPro = (QProcess*)sender();
QThread::msleep(100);
QString str = pPro->readAllStandardOutput();
QStringList list = str.split("\n", QString::SkipEmptyParts);
foreach(QString log,list)
{
if (m_pMain)
{
m_pMain->InsertLog(log);
}
else
exit(0);
}
}
void STwitterUserManage::readStandardError()
{
QProcess *pPro = (QProcess*)sender();
QThread::msleep(100);
QString str = pPro->readAllStandardError();
QStringList list = str.split("\n", QString::SkipEmptyParts);
foreach(QString log,list)
{
if (m_pMain)
{
m_pMain->InsertLog(log);
}
else
exit(0);
}
}

View File

@@ -0,0 +1,30 @@
#ifndef STWITTERUSERMANAGE_H
#define STWITTERUSERMANAGE_H
#include "smanage.h"
// Manages user-based Twitter crawling: spawns the external Python crawler and
// relays its output to the main widget's log (see stwitterusermanage.cpp).
class STwitterUserManage : public SManage
{
    Q_OBJECT
public:
    // Lifecycle states of the external crawler process.
    enum E_PROCESS_STATE
    {
        E_PROCESS_RUN = 0,     // idle; next Update() launches the crawler
        E_PROCESS_FINISH_WAIT, // crawler running; waiting for processFinished()
    };
    // Fix: single-argument constructor marked explicit to prevent accidental
    // implicit conversion from QObject*.
    explicit STwitterUserManage(QObject *pObject);
private:
    QString makeGetListQuery(QString _str, QDate _date, int _nPage);
private:
    QString m_strListQuery;         // NOTE(review): unused in the .cpp — candidate for removal
    QVector<QString> m_strListURL;  // NOTE(review): unused in the .cpp — candidate for removal
protected:
    bool Update();
    void Start();
    void processFinished(QProcess *pPro, QString _strOut);
    void ReLoadList();
private slots:
    void readStandardOutput();  // streams crawler stdout into the log
    void readStandardError();   // streams crawler stderr into the log
};
#endif // STWITTERUSERMANAGE_H

View File

@@ -0,0 +1,89 @@
#include "syoutubetagmanage.h"
#include <QThread>
#include "widget.h"
// Crawler manager for tag-based YouTube searches. Drives an external Python
// crawler process and relays its console output into the main widget's log.
SYoutubeTagManage::SYoutubeTagManage(QObject *pObject) : SManage(pObject)
{
    m_nID = 0;
    // Forward the crawler process's stdout/stderr into our logging slots.
    // m_pro[0] is presumably the QProcess slot inherited from SManage — only
    // index 0 is used by this manager.
    connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput()));
    connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError()));
}
// Arm the crawler: clear the completion flag and return to the RUN state so
// the next Update() pass launches the external process.
void SYoutubeTagManage::Start()
{
    m_bFinalLast = false;     // crawl not finished yet
    m_nMode = E_PROCESS_RUN;  // Update() will spawn the crawler from here
}
// One scheduler tick. Launches the external Python crawler when in the RUN
// state and no process slot is busy, then waits for processFinished().
// Returns true once the crawl has completed.
bool SYoutubeTagManage::Update()
{
    if (m_bFinalLast)
        return m_bFinalLast;

    if (m_nMode == E_PROCESS_RUN && UseProcess() == false)
    {
#if defined(Q_OS_WIN32)
        m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
#else
        m_pro[0].start("/usr/bin/python3", QStringList() << "webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
#endif
        m_nMode = E_PROCESS_FINISH_WAIT;
    }
    // In E_PROCESS_FINISH_WAIT there is nothing to do until the process exits.
    return m_bFinalLast;
}
// Invoked when the external crawler process exits. Only meaningful while we
// are waiting for it (E_PROCESS_FINISH_WAIT); any other state ignores it.
void SYoutubeTagManage::processFinished(QProcess *pPro, QString _strOut)
{
    (void)pPro;     // unused — this manager owns a single crawler process
    (void)_strOut;  // unused — output was already streamed via the read slots
    if (m_nMode != E_PROCESS_FINISH_WAIT)
        return;
    m_nMode = E_PROCESS_RUN;
    m_bFinalLast = true;
    m_pMain->InsertLog("Finish Crawling :)");
    m_pMain->SetCrawlingState("Finish");
    m_ncList = 1;
    m_bLast = false;
}
void SYoutubeTagManage::readStandardOutput()
{
QProcess *pPro = (QProcess*)sender();
QThread::msleep(100);
QString str = pPro->readAllStandardOutput();
QStringList list = str.split("\n", QString::SkipEmptyParts);
foreach(QString log,list)
{
if (m_pMain)
{
m_pMain->InsertLog(log);
}
else
exit(0);
}
}
void SYoutubeTagManage::readStandardError()
{
QProcess *pPro = (QProcess*)sender();
QThread::msleep(100);
QString str = pPro->readAllStandardError();
QStringList list = str.split("\n", QString::SkipEmptyParts);
foreach(QString log,list)
{
if (m_pMain)
{
m_pMain->InsertLog(log);
}
else
exit(0);
}
}

View File

@@ -0,0 +1,30 @@
// Fix: include guard renamed STYOUTUBETAGMANAGE_H -> SYOUTUBETAGMANAGE_H to
// match the class name and the convention of the sibling headers.
#ifndef SYOUTUBETAGMANAGE_H
#define SYOUTUBETAGMANAGE_H
#include "smanage.h"
// Manages tag-based YouTube crawling: spawns the external Python crawler and
// relays its output to the main widget's log (see syoutubetagmanage.cpp).
class SYoutubeTagManage : public SManage
{
    Q_OBJECT
public:
    // Lifecycle states of the external crawler process.
    enum E_PROCESS_STATE
    {
        E_PROCESS_RUN = 0,     // idle; next Update() launches the crawler
        E_PROCESS_FINISH_WAIT, // crawler running; waiting for processFinished()
    };
    // Fix: single-argument constructor marked explicit to prevent accidental
    // implicit conversion from QObject*.
    explicit SYoutubeTagManage(QObject *pObject);
private:
    QString makeGetListQuery(QString _str, QDate _date, int _nPage);
private:
    QString m_strListQuery;         // NOTE(review): unused in the .cpp — candidate for removal
    QVector<QString> m_strListURL;  // NOTE(review): unused in the .cpp — candidate for removal
protected:
    bool Update();
    void Start();
    void processFinished(QProcess *pPro, QString _strOut);
    void ReLoadList();
private slots:
    void readStandardOutput();  // streams crawler stdout into the log
    void readStandardError();   // streams crawler stderr into the log
};
#endif // SYOUTUBETAGMANAGE_H

View File

@@ -0,0 +1,89 @@
#include "syoutubeusermanage.h"
#include <QThread>
#include "widget.h"
// Crawler manager for user-based YouTube searches. Drives an external Python
// crawler process and relays its console output into the main widget's log.
SYoutubeUserManage::SYoutubeUserManage(QObject *pObject) : SManage(pObject)
{
    m_nID = 0;
    // Forward the crawler process's stdout/stderr into our logging slots.
    // m_pro[0] is presumably the QProcess slot inherited from SManage — only
    // index 0 is used by this manager.
    connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput()));
    connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError()));
}
// Arm the crawler: clear the completion flag and return to the RUN state so
// the next Update() pass launches the external process.
void SYoutubeUserManage::Start()
{
    m_bFinalLast = false;     // crawl not finished yet
    m_nMode = E_PROCESS_RUN;  // Update() will spawn the crawler from here
}
// One scheduler tick. Launches the external Python crawler when in the RUN
// state and no process slot is busy, then waits for processFinished().
// Returns true once the crawl has completed.
bool SYoutubeUserManage::Update()
{
    if (m_bFinalLast)
        return m_bFinalLast;

    if (m_nMode == E_PROCESS_RUN && UseProcess() == false)
    {
#if defined(Q_OS_WIN32)
        m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
#else
        m_pro[0].start("/usr/bin/python3", QStringList() << "webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage());
#endif
        m_nMode = E_PROCESS_FINISH_WAIT;
    }
    // In E_PROCESS_FINISH_WAIT there is nothing to do until the process exits.
    return m_bFinalLast;
}
// Invoked when the external crawler process exits. Only meaningful while we
// are waiting for it (E_PROCESS_FINISH_WAIT); any other state ignores it.
void SYoutubeUserManage::processFinished(QProcess *pPro, QString _strOut)
{
    (void)pPro;     // unused — this manager owns a single crawler process
    (void)_strOut;  // unused — output was already streamed via the read slots
    if (m_nMode != E_PROCESS_FINISH_WAIT)
        return;
    m_nMode = E_PROCESS_RUN;
    m_bFinalLast = true;
    m_pMain->InsertLog("Finish Crawling :)");
    m_pMain->SetCrawlingState("Finish");
    m_ncList = 1;
    m_bLast = false;
}
void SYoutubeUserManage::readStandardOutput()
{
QProcess *pPro = (QProcess*)sender();
QThread::msleep(100);
QString str = pPro->readAllStandardOutput();
QStringList list = str.split("\n", QString::SkipEmptyParts);
foreach(QString log,list)
{
if (m_pMain)
{
m_pMain->InsertLog(log);
}
else
exit(0);
}
}
void SYoutubeUserManage::readStandardError()
{
QProcess *pPro = (QProcess*)sender();
QThread::msleep(100);
QString str = pPro->readAllStandardError();
QStringList list = str.split("\n", QString::SkipEmptyParts);
foreach(QString log,list)
{
if (m_pMain)
{
m_pMain->InsertLog(log);
}
else
exit(0);
}
}

View File

@@ -0,0 +1,30 @@
// Fix: include guard renamed YOUTUBE_USER_MANAGE_H -> SYOUTUBEUSERMANAGE_H to
// match the class name and the convention of the sibling headers.
#ifndef SYOUTUBEUSERMANAGE_H
#define SYOUTUBEUSERMANAGE_H
#include "smanage.h"
// Manages user-based YouTube crawling: spawns the external Python crawler and
// relays its output to the main widget's log (see syoutubeusermanage.cpp).
class SYoutubeUserManage : public SManage
{
    Q_OBJECT
public:
    // Lifecycle states of the external crawler process.
    enum E_PROCESS_STATE
    {
        E_PROCESS_RUN = 0,     // idle; next Update() launches the crawler
        E_PROCESS_FINISH_WAIT, // crawler running; waiting for processFinished()
    };
    // Fix: single-argument constructor marked explicit to prevent accidental
    // implicit conversion from QObject*.
    explicit SYoutubeUserManage(QObject *pObject);
private:
    QString makeGetListQuery(QString _str, QDate _date, int _nPage);
private:
    QString m_strListQuery;         // NOTE(review): unused in the .cpp — candidate for removal
    QVector<QString> m_strListURL;  // NOTE(review): unused in the .cpp — candidate for removal
protected:
    bool Update();
    void Start();
    void processFinished(QProcess *pPro, QString _strOut);
    void ReLoadList();
private slots:
    void readStandardOutput();  // streams crawler stdout into the log
    void readStandardError();   // streams crawler stderr into the log
};
#endif // SYOUTUBEUSERMANAGE_H

View File

@@ -22,6 +22,10 @@
#include "sfacebooktagmanage.h" #include "sfacebooktagmanage.h"
#include "sfacebookusermanage.h" #include "sfacebookusermanage.h"
#include "snaverblogaccuracymanage.h" #include "snaverblogaccuracymanage.h"
#include "stwittertagmanage.h"
#include "stwitterusermanage.h"
#include "syoutubetagmanage.h"
#include "syoutubeusermanage.h"
#include <QApplication> #include <QApplication>
#include <QLabel> #include <QLabel>
@@ -51,6 +55,10 @@ Widget::Widget(QWidget *parent) : QWidget(parent) , m_nMode(E_MODE_WAIT)
m_pFacebookTag = new SFacebookTagManage(this); m_pFacebookTag = new SFacebookTagManage(this);
m_pFacebookUser = new SFacebookUserManage(this); m_pFacebookUser = new SFacebookUserManage(this);
m_pNaverBlogAccuracy = new SNaverBlogAccuracyManage(this); m_pNaverBlogAccuracy = new SNaverBlogAccuracyManage(this);
m_pTwitterTag = new STwitterTagManage(this);
m_pTwitterUser = new STwitterUserManage(this);
m_pYoutubeTag = new SYoutubeTagManage(this);
m_pYoutubeUser = new SYoutubeUserManage(this);
m_pManage[0] = m_pNaverCafe; m_pManage[0] = m_pNaverCafe;
m_pManage[1] = m_pNaverBlog; m_pManage[1] = m_pNaverBlog;
@@ -66,6 +74,10 @@ Widget::Widget(QWidget *parent) : QWidget(parent) , m_nMode(E_MODE_WAIT)
m_pManage[11] = m_pFacebookTag; m_pManage[11] = m_pFacebookTag;
m_pManage[12] = m_pFacebookUser; m_pManage[12] = m_pFacebookUser;
m_pManage[13] = m_pNaverBlogAccuracy; m_pManage[13] = m_pNaverBlogAccuracy;
m_pManage[14] = m_pTwitterTag;
m_pManage[15] = m_pTwitterUser;
m_pManage[16] = m_pYoutubeTag;
m_pManage[17] = m_pYoutubeUser;
m_db = QSqlDatabase::addDatabase("QMYSQL"); m_db = QSqlDatabase::addDatabase("QMYSQL");
m_db.setHostName("bigbird.iptime.org"); m_db.setHostName("bigbird.iptime.org");

View File

@@ -26,6 +26,10 @@ class SInstaUserManage;
class SFacebookTagManage; class SFacebookTagManage;
class SFacebookUserManage; class SFacebookUserManage;
class SNaverBlogAccuracyManage; class SNaverBlogAccuracyManage;
class STwitterTagManage;
class STwitterUserManage;
class SYoutubeTagManage;
class SYoutubeUserManage;
#define SAFE_DELETE(p) {if(p) delete (p); (p) = NULL; } #define SAFE_DELETE(p) {if(p) delete (p); (p) = NULL; }
@@ -55,7 +59,7 @@ private:
QLineEdit *m_pedStartDay; QLineEdit *m_pedStartDay;
QTimer m_timer,m_timerAlive; QTimer m_timer,m_timerAlive;
QSqlDatabase m_db; QSqlDatabase m_db;
static const int C_PLATFORM_MAX = 14; static const int C_PLATFORM_MAX = 18;
SManage *m_pManage[C_PLATFORM_MAX]; SManage *m_pManage[C_PLATFORM_MAX];
QListWidget *m_pResultList; QListWidget *m_pResultList;
QString m_strFileName; QString m_strFileName;
@@ -75,6 +79,11 @@ private:
SFacebookTagManage *m_pFacebookTag; SFacebookTagManage *m_pFacebookTag;
SFacebookUserManage *m_pFacebookUser; SFacebookUserManage *m_pFacebookUser;
SNaverBlogAccuracyManage *m_pNaverBlogAccuracy; SNaverBlogAccuracyManage *m_pNaverBlogAccuracy;
STwitterTagManage* m_pTwitterTag;
STwitterUserManage* m_pTwitterUser;
SYoutubeTagManage* m_pYoutubeTag;
SYoutubeUserManage* m_pYoutubeUser;
int m_nStartTime,m_nRangeTime,m_nPlatform; int m_nStartTime,m_nRangeTime,m_nPlatform;
//QGroupBox *m_pgbManual; //QGroupBox *m_pgbManual;
QCheckBox *m_pcheckboxReal; QCheckBox *m_pcheckboxReal;

View File

@@ -1,4 +1,4 @@
#include "scrawler.h" #include "scrawler.h"
#include <QCoreApplication> #include <QCoreApplication>
#include <iostream> #include <iostream>

View File

@@ -1,4 +1,4 @@
#include "scrawler.h" #include "scrawler.h"
#include <iostream> #include <iostream>
#include <QSqlQuery> #include <QSqlQuery>
#include <QSqlError> #include <QSqlError>

View File

@@ -1,4 +1,4 @@
#ifndef SCRAWLER_H #ifndef SCRAWLER_H
#define SCRAWLER_H #define SCRAWLER_H
#include <QtWebKitWidgets> #include <QtWebKitWidgets>

View File

@@ -1,4 +1,4 @@
#ifndef SCRAWLERDATA #ifndef SCRAWLERDATA
#define SCRAWLERDATA #define SCRAWLERDATA
#endif // SCRAWLERDATA #endif // SCRAWLERDATA

View File

@@ -85,6 +85,10 @@ Widget::Widget(QWidget *parent)
"WHEN 11 THEN 'Facebook Tag' " "WHEN 11 THEN 'Facebook Tag' "
"WHEN 12 THEN 'Facebook User' " "WHEN 12 THEN 'Facebook User' "
"WHEN 13 THEN 'Naver Blog Accuracy' " "WHEN 13 THEN 'Naver Blog Accuracy' "
"WHEN 14 THEN 'Twitter Tag' "
"WHEN 15 THEN 'Twitter User' "
"WHEN 16 THEN 'Youtube Tag' "
"WHEN 17 THEN 'Youtube User' "
"ELSE 'UnKnown'" "ELSE 'UnKnown'"
"END AS platform FROM keyword where state is null"); "END AS platform FROM keyword where state is null");
m_pmodelGroup->setQuery("SELECT * FROM datagroup"); m_pmodelGroup->setQuery("SELECT * FROM datagroup");
@@ -140,7 +144,7 @@ QGroupBox *Widget::setKeywordWidgets()
m_pcbPlatform = new QComboBox; m_pcbPlatform = new QComboBox;
m_pcbPlatform->addItems(QStringList() << "Naver Cafe" << "Naver Blog" << "Daum Cafe" << "Naver News" << "Naver Cafe List" << "Daum Cafe List" m_pcbPlatform->addItems(QStringList() << "Naver Cafe" << "Naver Blog" << "Daum Cafe" << "Naver News" << "Naver Cafe List" << "Daum Cafe List"
<< "Kakao Story Channel" << "Kakao Story Tag" << "Kakao Story User" << "Instagram Tag" << "Instagram User" << "Kakao Story Channel" << "Kakao Story Tag" << "Kakao Story User" << "Instagram Tag" << "Instagram User"
<< "Facebook Tag" << "Facebook User" << "Naver Blog Accuracy"); << "Facebook Tag" << "Facebook User" << "Naver Blog Accuracy" << "Twitter Tag" << "Twitter User" << "Youtube Tag" << "Youtube User");
m_pleKeyword = new QLineEdit; m_pleKeyword = new QLineEdit;
m_pleAuthorship = new QLineEdit; m_pleAuthorship = new QLineEdit;
@@ -380,6 +384,10 @@ void Widget::on_keyword_currentRowChanged(QModelIndex _index)
if (str == QString("Facebook Tag")) nSelect = 11; if (str == QString("Facebook Tag")) nSelect = 11;
if (str == QString("Facebook User")) nSelect = 12; if (str == QString("Facebook User")) nSelect = 12;
if (str == QString("Naver Blog Accuracy")) nSelect = 13; if (str == QString("Naver Blog Accuracy")) nSelect = 13;
if (str == QString("Twitter Tag")) nSelect = 14;
if (str == QString("Twitter User")) nSelect = 15;
if (str == QString("Youtube Tag")) nSelect = 16;
if (str == QString("Youtube User")) nSelect = 17;
m_pcbPlatform->setCurrentIndex(nSelect); m_pcbPlatform->setCurrentIndex(nSelect);
} }
} }
@@ -504,6 +512,10 @@ void Widget::on_keyword_button_insert()
"WHEN 11 THEN 'Facebook Tag' " "WHEN 11 THEN 'Facebook Tag' "
"WHEN 12 THEN 'Facebook User' " "WHEN 12 THEN 'Facebook User' "
"WHEN 13 THEN 'Naver Blog Accuracy' " "WHEN 13 THEN 'Naver Blog Accuracy' "
"WHEN 14 THEN 'Twitter Tag' "
"WHEN 15 THEN 'Twitter User' "
"WHEN 16 THEN 'Youtube Tag' "
"WHEN 17 THEN 'Youtube User' "
"ELSE 'UnKnown'" "ELSE 'UnKnown'"
"END AS platform FROM keyword where state is null"); "END AS platform FROM keyword where state is null");
} }
@@ -535,6 +547,10 @@ void Widget::on_keyword_button_delete()
"WHEN 11 THEN 'Facebook Tag' " "WHEN 11 THEN 'Facebook Tag' "
"WHEN 12 THEN 'Facebook User' " "WHEN 12 THEN 'Facebook User' "
"WHEN 13 THEN 'Naver Blog Accuracy' " "WHEN 13 THEN 'Naver Blog Accuracy' "
"WHEN 14 THEN 'Twitter Tag' "
"WHEN 15 THEN 'Twitter User' "
"WHEN 16 THEN 'Youtube Tag' "
"WHEN 17 THEN 'Youtube User' "
"ELSE 'UnKnown'" "ELSE 'UnKnown'"
"END AS platform FROM keyword where state is null"); "END AS platform FROM keyword where state is null");
} }
@@ -576,6 +592,10 @@ void Widget::on_keyword_button_modify()
"WHEN 11 THEN 'Facebook Tag' " "WHEN 11 THEN 'Facebook Tag' "
"WHEN 12 THEN 'Facebook User' " "WHEN 12 THEN 'Facebook User' "
"WHEN 13 THEN 'Naver Blog Accuracy' " "WHEN 13 THEN 'Naver Blog Accuracy' "
"WHEN 14 THEN 'Twitter Tag' "
"WHEN 15 THEN 'Twitter User' "
"WHEN 16 THEN 'Youtube Tag' "
"WHEN 17 THEN 'Youtube User' "
"ELSE 'UnKnown'" "ELSE 'UnKnown'"
"END AS platform FROM keyword where state is null"); "END AS platform FROM keyword where state is null");
} }
@@ -1100,7 +1120,14 @@ void Widget::on_group_button_copy_start()
void Widget::UpdateCrawling() void Widget::UpdateCrawling()
{ {
m_pmodelCrawling->setQuery("SELECT _crawling.id,_keyword.realtime,_keyword.searches,_keyword.start,_keyword.end, _datagroup.name , " m_pmodelCrawling->setQuery("SELECT _crawling.id,_keyword.realtime,_keyword.searches,_keyword.start,_keyword.end, _datagroup.name , "
"(CASE _keyword.platform WHEN 0 THEN 'Naver Cafe' WHEN 1 THEN 'Naver Blog' WHEN 2 THEN 'Daum Cafe' WHEN 3 THEN 'Naver News' WHEN 4 THEN 'Naver Cafe List' WHEN 5 THEN 'Daum Cafe List' WHEN 6 THEN 'Kakao Story Channel' " "(CASE _keyword.platform "
"WHEN 0 THEN 'Naver Cafe' "
"WHEN 1 THEN 'Naver Blog' "
"WHEN 2 THEN 'Daum Cafe' "
"WHEN 3 THEN 'Naver News' "
"WHEN 4 THEN 'Naver Cafe List' "
"WHEN 5 THEN 'Daum Cafe List' "
"WHEN 6 THEN 'Kakao Story Channel' "
"WHEN 7 THEN 'Kakao Story Tag' " "WHEN 7 THEN 'Kakao Story Tag' "
"WHEN 8 THEN 'Kakao Story User' " "WHEN 8 THEN 'Kakao Story User' "
"WHEN 9 THEN 'Instagram Tag' " "WHEN 9 THEN 'Instagram Tag' "
@@ -1108,6 +1135,10 @@ void Widget::UpdateCrawling()
"WHEN 11 THEN 'Facebook Tag' " "WHEN 11 THEN 'Facebook Tag' "
"WHEN 12 THEN 'Facebook User' " "WHEN 12 THEN 'Facebook User' "
"WHEN 13 THEN 'Naver Blog Accuracy' " "WHEN 13 THEN 'Naver Blog Accuracy' "
"WHEN 14 THEN 'Twitter Tag' "
"WHEN 15 THEN 'Twitter User' "
"WHEN 16 THEN 'Youtube Tag' "
"WHEN 17 THEN 'Youtube User' "
"ELSE 'UnKnown' END ) AS platform , " "ELSE 'UnKnown' END ) AS platform , "
"(CASE _crawling.state WHEN 0 THEN 'Waiting' WHEN 1 THEN 'Running' WHEN 2 THEN 'Terminated' ELSE 'None' END ) AS state " "(CASE _crawling.state WHEN 0 THEN 'Waiting' WHEN 1 THEN 'Running' WHEN 2 THEN 'Terminated' ELSE 'None' END ) AS state "
"FROM crawling _crawling INNER JOIN keyword _keyword ON _crawling.keyword_id = _keyword.id " "FROM crawling _crawling INNER JOIN keyword _keyword ON _crawling.keyword_id = _keyword.id "

View File

@@ -32,6 +32,7 @@ def is_debugger_attached():
is_debug = is_debugger_attached() is_debug = is_debugger_attached()
def printl(*objects, sep=' ', end='\n', file=None, flush=True): def printl(*objects, sep=' ', end='\n', file=None, flush=True):
if is_debug: if is_debug:
cur_frame = inspect.currentframe() cur_frame = inspect.currentframe()

View File

@@ -0,0 +1,79 @@
from pymysql.connections import Connection
import datetime
from numbers import Number
class DataDBRow:
    """One row of the data_{N} tables: a crawled article plus its metadata.

    get_keys / get_values / get_insert_query all enumerate the attributes of a
    pristine DataDBRow instance, so subclass-only attributes are deliberately
    excluded from the SQL — presumably matching the table schema (TODO confirm).
    Fix: the key enumeration and SQL-literal conversion were duplicated across
    three methods; they are extracted into shared helpers.
    """

    def __init__(self):
        self.platform_name = None      # e.g. 'twitter'
        self.platform_form = None
        self.platform_title = None
        self.article_form = None
        self.article_parent = None
        self.article_id = None
        self.article_nickname = None
        self.article_title = None
        self.article_data = None       # article body text
        self.article_url = None
        self.article_hit = 0
        self.article_date = None
        self.article_order = 0
        self.article_profile = None
        self.article_profileurl = None
        self.platform_id = None
        self.keyword_id = -1           # FK into the keyword table
        self.reply_url = None
        self.etc = None

    @staticmethod
    def _template_keys():
        # Column names come from a fresh DataDBRow so subclass extras are
        # filtered out; dict insertion order makes this deterministic.
        inst = DataDBRow()
        return tuple(key for key in inst.__dict__ if not key.startswith('__'))

    @staticmethod
    def _to_sql_literal(conn, value):
        # Convert one attribute value into an SQL literal string.
        if isinstance(value, Number):
            return str(value)
        if isinstance(value, str):
            # Round-trip through utf8, as the original did, before escaping.
            return conn.escape(value.encode('utf8').decode('utf8'))
        return conn.escape(value)  # None, datetime, ... handled by the driver

    def get_keys(self):
        """Return the tuple of column names used by the insert query."""
        return self._template_keys()

    def get_values(self, conn, db_num):
        """Return this row's values as escaped SQL literal strings.

        `db_num` is unused but kept for interface compatibility.
        """
        return tuple(self._to_sql_literal(conn, self.__dict__[key])
                     for key in self._template_keys())

    def get_insert_query(self, conn, db_num):
        """Build the `insert into data_{db_num} (...) values (...)` statement."""
        keys = self._template_keys()
        values = self.get_values(conn, db_num)
        return 'insert into data_{} ({}) values ({})'.format(
            db_num, ', '.join(keys), ', '.join(values))

View File

@@ -97,6 +97,31 @@ def get_driver(platform, proxies):
else: else:
return platform_webdriver[platform](capabilities=desired_capabilities) return platform_webdriver[platform](capabilities=desired_capabilities)
# Proxies that have already been reported dead during this run.
_expired_proxies = []


def set_proxy_expired(proxy):
    # Mark `proxy` (a requests-style {'http': 'http://ip:port'} dict) as dead:
    # remember it in-process, then comment out its line in the proxy file and
    # move that line to the end so get_proxy_from_file() will skip it.
    # NOTE(review): the file rewrite is not guarded against concurrent access
    # from other crawler processes — verify whether that matters here.
    if proxy not in _expired_proxies:
        _expired_proxies.append(proxy)
    # Strip the scheme prefix to recover the raw 'ip:port' address.
    address = proxy['http'][len('http://'):]
    with open(proxy_filename, 'r') as f:
        lines = f.readlines()
    expired_idx = -1
    for idx, line in enumerate(lines):
        if line.startswith(address):
            expired_idx = idx
            break
    if expired_idx >= 0:
        # Comment the line out and push it to the end of the file.
        lines[expired_idx] = '# ' + lines[expired_idx]
        lines.append(lines.pop(expired_idx))
        with open(proxy_filename, 'w') as f:
            f.writelines(lines)
def get_proxy_from_file(filename): def get_proxy_from_file(filename):
""" """
@@ -104,7 +129,7 @@ def get_proxy_from_file(filename):
:return (ip, port): string, string :return (ip, port): string, string
if ip, port or filename is invalid, return (None, None) if ip, port or filename is invalid, return (None, None)
""" """
proxy_lists = [line.replace('\n', '') for line in open(filename) if re_ip.search(line)] proxy_lists = [line.replace('\n', '') for line in open(filename) if not line.strip().startswith('#') and re_ip.search(line)]
if proxy_lists: if proxy_lists:
m = re_ip.search(proxy_lists[random.randint(0, len(proxy_lists) - 1)]) m = re_ip.search(proxy_lists[random.randint(0, len(proxy_lists) - 1)])
if m: if m:

View File

@@ -0,0 +1,3 @@
requests
bs4
pytz

View File

View File

@@ -0,0 +1,62 @@
import datetime
import copy
class TwitterConfig:
    """Per-keyword crawl configuration for the Twitter crawler."""

    # Endpoint constants for twitter.com's legacy search/conversation pages.
    protocol = 'https'
    top_url = 'twitter.com'
    search_url = '/i/search/timeline'
    conversation_url_form = '/i/{}/conversation/{}'

    def __init__(self):
        self.keyword_id = -1   # FK into the keyword table
        self.db_num = -1       # target data_{N} table number
        self.id = 0
        self.realtime = False
        self.keywords = []     # search terms parsed from the DB row
        self.start_str = None  # 'YYYY-MM-DD' form of start
        self.start = None      # datetime at midnight of the start day
        self.end_str = None
        self.end = None
        self.authorship = None
        self.state = None
        self.platform = None

    def set_param(self, keyword_id, db_num, params):
        """Populate this config from a keyword DB row (`params` is dict-like)."""
        self.keyword_id = int(keyword_id)
        self.db_num = int(db_num)
        self.id = int(params['id'])
        self.realtime = params['realtime'] == '1'
        self.keywords = [term.strip() for term in params['searches'].split(',')]
        self.start_str = str(params['start'])
        self.end_str = str(params['end'])
        # Promote the DB date values to datetimes at midnight.
        midnight = datetime.datetime.min.time()
        self.start = datetime.datetime.combine(params['start'], midnight)
        self.end = datetime.datetime.combine(params['end'], midnight)
        self.authorship = params['authorship']
        self.state = params['state']
        self.platform = params['platform']

    def split(self):
        """Break [start, end] into one-day-wide configs, newest window first."""
        pieces = []
        cursor = self.end
        while cursor > self.start:
            piece = copy.deepcopy(self)
            piece.end = cursor
            cursor = cursor + datetime.timedelta(days=-1)
            piece.start = cursor
            piece.start_str = piece.start.strftime('%Y-%m-%d')
            piece.end_str = piece.end.strftime('%Y-%m-%d')
            pieces.append(piece)
        return pieces

View File

@@ -0,0 +1,79 @@
from twitter.tweet import Tweet
import multiprocessing as mp
class TwitterDBHelper:
    """MySQL access helper for the Twitter crawler: reads keyword parameters
    and inserts crawled tweets into the data_{N} tables."""

    # Resolved once at class-definition time; __import__('pymysql.cursors')
    # returns the top-level pymysql package with the cursors submodule loaded.
    pymysql = __import__('pymysql.cursors')

    def __init__(self):
        self.tweets = []       # NOTE(review): never used below — leftover? verify
        self.buffer = []       # batching buffer for the disabled batch path below
        self.lock = mp.Lock()  # guard for the (currently commented-out) batching
        pass

    def __del__(self):
        pass

    def get_param(self, keyword_id):
        # Fetch the keyword row for `keyword_id`; the whole process exits with
        # status 1 if the DB is unreachable.
        query = "select * from keyword where id = " + str(keyword_id)
        params = []
        try:
            conn = self.pymysql.connect(host='bigbird.iptime.org',
                                        user='admin', passwd='admin123',
                                        db='concepters', charset='utf8',
                                        cursorclass=self.pymysql.cursors.DictCursor)
            with conn.cursor() as cursor:
                cursor.execute(query)
                params = cursor.fetchone()  # single row (dict) or None
        except Exception as e:
            print(e)
            exit(1)
        else:
            conn.close()
        return params

    def insert_tweet(self, tweet: Tweet = None, db_num: int = -1, flush=False):
        # Insert one tweet into data_{db_num}. The original batched design is
        # kept below (commented out); currently every call writes immediately.
        # `flush` is only meaningful for the disabled batch path.
        # self.lock.acquire()
        # if tweet is not None:
        #     self.buffer.append((tweet, db_num, ))
        #
        # local_buffer = None
        # if len(self.buffer) >= 100 or flush:
        #     local_buffer = copy.deepcopy(self.buffer)
        #     self.buffer.clear()
        # self.lock.release()
        local_buffer = [(tweet, db_num, )]
        if local_buffer:
            # NOTE(review): retries the connection forever on failure with no
            # backoff — confirm this busy-retry is intended.
            while True:
                try:
                    conn = self.pymysql.connect(host='bigbird.iptime.org',
                                                user='admin', passwd='admin123',
                                                db='concepters', charset='utf8',
                                                cursorclass=self.pymysql.cursors.DictCursor,
                                                connect_timeout=5)
                except Exception as e:
                    print(e)
                    continue
                else:
                    break
            try:
                with conn.cursor() as cursor:
                    for tweet, _db_num in local_buffer:
                        query = tweet.get_insert_query(conn, _db_num)
                        cursor.execute(query)
                    conn.commit()
            except Exception as e:
                # Insert failures are logged and swallowed (best-effort write).
                print(e)
            finally:
                conn.close()

View File

@@ -0,0 +1,24 @@
from base.dbdata import DataDBRow
class Tweet(DataDBRow):
    """A single crawled tweet; extends the generic DataDBRow with
    tweet-specific fields (ids, counters, thread links)."""

    def __init__(self):
        # Fix: plain super() instead of super(self.__class__, self).__init__(),
        # which recurses infinitely if Tweet is ever subclassed.
        super().__init__()
        self.tweet_id = None
        self.user_id = None
        self.user_name = None
        self.text = None
        self.created_at = None
        self.retweets = 0
        self.favorites = 0
        self.is_reply = False     # True when this tweet is a reply in a thread
        self.reply_cnt = 0
        self.retweet_cnt = 0
        self.favorite_cnt = 0
        self.top_link = None      # link to the top tweet of the thread
        self.tweet_link = None    # permalink of this tweet
        self.depth = 0            # nesting depth within a conversation

View File

@@ -0,0 +1,289 @@
from twitter.twconfig import TwitterConfig
from twitter.twdbhelper import TwitterDBHelper
from twitter.tweet import Tweet
from twitter.twparser import TweetParser
import base.proxy
import base.baseclasses
import requests
import bs4
import json
import urllib
import threading
import queue
import time
class TwitterCrawler():
def __init__(self):
    # Default crawl configuration; filled in per-keyword by set_arguments().
    self.default_config = TwitterConfig()
    # Shared DB helper used by the runner and content threads.
    self.db_helper = TwitterDBHelper()
def set_arguments(self, browser, keyword_id, db_num, before_day, until_page):
    # Load the keyword row from the DB and populate the default config.
    # NOTE(review): browser, before_day and until_page are accepted for
    # interface compatibility with the other crawlers but unused here —
    # confirm callers expect that.
    params = self.db_helper.get_param(keyword_id)
    self.default_config.set_param(keyword_id, db_num, params)
@staticmethod
def get_timeline_url(query, start_str, end_str, max_position=''):
    """Build the twitter.com search-timeline URL for a query within a
    since/until date window, optionally continuing from `max_position`."""
    search_q = '{} since:{} until:{}'.format(query, start_str, end_str)
    encoded = urllib.parse.urlencode({
        'f': 'tweets',
        'vertical': 'default',
        'src': 'typd',
        'q': search_q,
        'language': 'en',
        'max_position': max_position,
    })
    parts = (TwitterConfig.protocol, TwitterConfig.top_url,
             TwitterConfig.search_url, '', encoded, '')
    return urllib.parse.urlunparse(parts)
@staticmethod
def get_content_url(user_id, tweet_id, max_position=''):
    """Build the conversation (replies) URL for one tweet, optionally
    continuing from `max_position`."""
    encoded = urllib.parse.urlencode({'max_position': max_position})
    path = TwitterConfig.conversation_url_form.format(user_id, tweet_id)
    parts = (TwitterConfig.protocol, TwitterConfig.top_url, path, '', encoded, '')
    return urllib.parse.urlunparse(parts)
@staticmethod
def get_page(url, proc_id):
    # GET `url` through a proxy, rotating to a new proxy whenever the request
    # fails; gives up (returns None) only when no proxy is available.
    # NOTE(review): any request exception (timeout, DNS, ...) expires the
    # current proxy — confirm that is the intended policy.
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36',
        'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4',
    }
    # if proxies is None:
    proxies = base.proxy.get_proxy_for_requests()
    resp = None
    while True:
        try:
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=3)
        except Exception as e:
            if proxies == (None, None):
                # No working proxy left — presumably the "no proxy" sentinel
                # from get_proxy_for_requests(); verify.
                break
            print('[{}] proxy {} is expired. ({})'.format(proc_id, proxies, e))
            base.proxy.set_proxy_expired(proxies)
            proxies = base.proxy.get_proxy_for_requests()
        else:
            break
    return resp
def runner_proc(self, proc_id, content_queue, result_queue, config):
    # Timeline worker: pages through the search results for one date window,
    # stores every top-level tweet, and queues tweets that have replies for
    # the content threads to expand. Puts (proc_id, tweet_count) on
    # result_queue when done. Note: b_continue is never set False — the loop
    # only exits via the break statements below.
    print('{} to {} runner thread start'.format(config.start_str, config.end_str))
    b_continue = True
    min_tweet_id = None
    max_tweet_id = None
    max_position = ''
    tweet_count = 0
    while b_continue:
        if min_tweet_id is not None:
            # Pagination cursor: first-seen id stays fixed, last id advances.
            max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id)
        url = self.get_timeline_url(config.keywords[0], config.start_str, config.end_str, max_position)
        resp = self.get_page(url, proc_id)
        if resp is None:
            break
        j = json.loads(resp.content.decode('utf-8'))
        soup = bs4.BeautifulSoup(j['items_html'], 'lxml')
        tweet_tags = soup.select("div.tweet")
        for tw in tweet_tags:
            tweet = TweetParser.parse(tw, config.keyword_id)
            if tweet.is_reply is True:
                # print(' ## {}: {}...'.format(tweet.user_name, tweet.text[:20]))
                continue  # replies are collected via the conversation pages
            if tweet.reply_cnt > 0:
                # Hand the thread to the content workers for reply expansion.
                self.insert_content_pool(proc_id, content_queue, tweet, tweet)
            self.db_helper.insert_tweet(tweet, config.db_num)
            # print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20]))
        count = len(tweet_tags)
        if count == 0:
            break  # no more results for this window
        if min_tweet_id is None:
            min_tweet_id = tweet_tags[0].attrs['data-item-id']
        max_tweet_id = tweet_tags[-1].attrs['data-item-id']
        tweet_count += count
    print('{} to {} runner thread finished {}'.format(config.start_str, config.end_str, tweet_count))
    result_queue.put((proc_id, tweet_count, ))
    # self.runner_processing[proc_id].value = False
    return proc_id, tweet_count,
@staticmethod
def insert_content_pool(proc_id: int, qu, tweet: Tweet, tweet_top: Tweet):
    # Queue a (tweet, top-of-thread tweet) pair for the content workers;
    # proc_id is only used by the commented-out debug print.
    # print(' [{}] pool insert: {} ({})'.format(proc_id, tweet.text[:20] if tweet.text else '', tweet.tweet_link))
    qu.put((tweet, tweet_top,))
@staticmethod
def get_content(content_queue):
    """Pop one (parent, top) pair from the queue; poll every 2s and return
    (None, None) after roughly 60 seconds with no work."""
    idle_since = time.time()
    while True:
        try:
            parent, top = content_queue.get(block=True, timeout=2)
        except Exception:
            if time.time() - idle_since > 60:
                return None, None,  # queue stayed empty long enough — give up
        else:
            return parent, top,
def content_proc(self, proc_id, content_queue, result_queue):
    """Worker loop that crawls the reply thread of each queued tweet.

    Pulls (parent, top) pairs from content_queue, pages through the
    parent tweet's conversation view, stores every reply via the DB
    helper, and re-enqueues each reply so its own sub-replies are
    fetched too. Exits when get_content gives up (queue empty past
    its timeout).

    :param proc_id: index of this content thread
    :param content_queue: shared work queue of (parent, top) pairs
    :param result_queue: receives (proc_id, tweet_count) on exit
    :return: (proc_id, tweet_count)
    """
    print('[{}] content thread start'.format(proc_id))
    tweet_count = 0
    while True:
        parent_tw, top_tw, = self.get_content(content_queue)
        if not parent_tw:
            # Queue stayed empty past get_content's timeout: shut down.
            break
        # print(' [{}] <<< parent : {} ({})'.format(proc_id, parent_tw.text[:20], parent_tw.tweet_link))
        max_position = ''  # conversation scroll cursor ('' = first page)
        b_continue = True
        while b_continue:
            url = self.get_content_url(parent_tw.user_id, parent_tw.tweet_id, max_position)
            resp = self.get_page(url, proc_id)
            if resp is None or resp.status_code == 404:
                # 404: tweet deleted or protected -- drop this conversation.
                break
            elif resp.status_code != 200:
                # NOTE(review): retries the same URL with the same cursor
                # immediately; could spin if the error persists -- confirm.
                print('[WARNING] content_get code {}'.format(resp.status_code))
                continue
            j = json.loads(resp.content.decode('utf-8'))
            soup = bs4.BeautifulSoup(j['items_html'], 'lxml')
            reply_container_tags = soup.select('li.ThreadedConversation')
            # Lone-tweet containers hold replies rendered outside the
            # threaded groupings (parent itself is filtered out there).
            reply_container_tags += TweetParser.get_lone_container(soup, parent_tw)
            for container_tags in reply_container_tags:
                tweet_tags = container_tags.select('div.tweet')
                if len(tweet_tags) > 0:
                    tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw)
                    # print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link))
                    # Re-enqueue so this reply's own replies get crawled too.
                    self.insert_content_pool(proc_id, content_queue, tweet, top_tw)
                    self.db_helper.insert_tweet(tweet, self.default_config.db_num)
                    tweet_count += 1
            b_continue = j['has_more_items']
            if b_continue:
                max_position = j['min_position']
    result_queue.put((proc_id, tweet_count))
    print('[{}] content thread finished'.format(proc_id))
    return proc_id, tweet_count,
def debug_content(self):
    """Manual test: seed one known tweet and run only the content threads.

    Spawns 16 content_proc workers against a single hand-picked tweet so
    the reply-crawling path can be exercised in isolation, then prints
    each worker's (proc_id, tweet_count) result.
    """
    content_qu = queue.Queue()
    content_result_qu = queue.Queue()
    # (Removed an unused runner_result_qu that was never read or written.)
    test_tw = Tweet()
    # test_tw.tweet_link = 'https://twitter.com/yniold_/status/886863893137678337'
    # test_tw.user_id = 'yniold_'
    # test_tw.tweet_id = 886863893137678337
    test_tw.tweet_link = 'https://twitter.com/Awesome_vely/status/888704413111435264'
    test_tw.user_id = 'Awesome_vely'
    test_tw.tweet_id = 888704413111435264
    test_tw.text = '시작'
    self.insert_content_pool(0, content_qu, test_tw, test_tw)
    content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu))
                       for proc_id in range(16)]
    for th in content_threads:
        th.start()
    for th in content_threads:
        th.join()
    while not content_result_qu.empty():
        res = content_result_qu.get()
        print('reply : {}'.format(res))
    print('end all')
def test_insert_db(self):
    """Manual test: insert the same fixture tweet five times (duplicate-handling check)."""
    fixture = Tweet()
    fixture.tweet_link = 'https://twitter.com/moonriver365/status/885797401033818112'
    fixture.user_id = 'moonriver365'
    fixture.tweet_id = 885797401033818112
    for _ in range(5):
        self.db_helper.insert_tweet(fixture, self.default_config.db_num)
def debug(self):
    """Ad-hoc debug entry point; does real work only when base.baseclasses.is_debug is set.

    The commented-out lines are alternative manual checks (proxy
    handling, config splitting, DB insertion) toggled by hand.
    """
    if base.baseclasses.is_debug:
        ## check proxy
        # base.proxy.get_proxy_from_file('proxy.txt')
        # proxy = {'https': 'http://45.56.86.93:3128', 'http': 'http://45.56.86.93:3128'}
        # base.proxy.set_proxy_expired(proxy)
        # return
        ## contents check
        self.debug_content()
        # split_config = self.default_config.split()
        # self.test_insert_db()
        print("debug end")
        # exit()
def start(self):
    """Run the full crawl.

    Runner threads scrape the search timeline (one per split config)
    while 16 content threads crawl reply conversations from the shared
    queue. Any runner that produced zero tweets is retried once, then
    the total elapsed time is printed.
    """
    start_time = time.time()
    # self.debug()
    # return
    # run
    split_config = self.default_config.split()
    content_qu = queue.Queue()
    runner_result_qu = queue.Queue()
    content_result_qu = queue.Queue()
    runner_threads = [threading.Thread(target=self.runner_proc, args=(proc_id, content_qu, runner_result_qu, config))
                      for proc_id, config in enumerate(split_config)]
    content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu))
                      for proc_id in range(16)]
    for th in runner_threads:
        th.start()
    for th in content_threads:
        th.start()
    for th in runner_threads:
        th.join()
    for th in content_threads:
        th.join()
    # rerun zero runners
    # runner_proc reports (proc_id, tweet_count). BUGFIX: the old code
    # compared the tuple itself to 0 ("if res == 0"), which was never
    # true, so failed ranges were never retried; it also advanced a
    # separate idx instead of using the reported proc_id.
    runner_threads = []
    runner_result_qu2 = queue.Queue()
    while not runner_result_qu.empty():
        res_proc_id, res_count = runner_result_qu.get()
        if res_count == 0:
            runner_threads.append(threading.Thread(
                target=self.runner_proc,
                args=(res_proc_id, content_qu, runner_result_qu2, split_config[res_proc_id])))
    if runner_threads:
        # Only spin up the second wave of content threads when there is
        # actually something to rerun (they otherwise idle ~60s each).
        content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu))
                           for proc_id in range(16)]
        for th in runner_threads:
            th.start()
        for th in content_threads:
            th.start()
        for th in runner_threads:
            th.join()
        for th in content_threads:
            th.join()
    # print running time
    delta = time.time() - start_time
    m, s = divmod(delta, 60)
    h, m = divmod(m, 60)
    print("finished all {}:{:02d}:{:02d} ".format(int(h), int(m), int(s)))

View File

@@ -0,0 +1,96 @@
from twitter.tweet import Tweet
from twitter.twconfig import TwitterConfig
import bs4
import datetime
import pytz
class TweetParser:
    """Turns scraped Twitter HTML tweet tags into populated Tweet objects."""

    @staticmethod
    def parse(tag, keyword_id, depth=0, top_tw: Tweet=None):
        """Parse one ``div.tweet`` bs4 tag into a Tweet.

        :param tag: bs4 tag wrapping the tweet markup
        :param keyword_id: crawl keyword id this tweet was found under
        :param depth: 0 for a top-level tweet, >0 for replies
        :param top_tw: root tweet of the conversation (None for roots)
        :return: a fully populated Tweet
        """
        tweet = Tweet()
        tweet.tweet_id = int(tag.attrs['data-tweet-id'])
        nickname_tag = tag.select('strong.fullname')[0]
        tweet.user_name = ''
        # The display name may be split across several text nodes
        # (emoji images break it up); join the text parts with spaces.
        for child in nickname_tag.children:
            if isinstance(child, bs4.element.NavigableString):
                if len(tweet.user_name) > 0:
                    tweet.user_name += ' '
                tweet.user_name += child
        tweet.user_id = tag.select('span.username')[0].text[1:]
        tweet.text = tag.select('p.tweet-text')[0].text
        # Older, locale-dependent timestamp parsing kept for reference:
        # time_str = tag.select('a.tweet-timestamp')[0].attrs['title']
        # english
        # tweet.created_at = datetime.datetime.strptime(time_str, '%I:%M %p - %d %b %Y')
        # korean
        # time_str = time_str.replace('오전', 'AM').replace('오후', 'PM')
        # tweet.created_at = datetime.datetime.strptime(time_str, '%p %I:%M - %Y년 %m월 %d일')
        # The epoch attribute is locale-free; convert UTC -> Asia/Seoul.
        timestamp = int(tag.select('span._timestamp')[0].attrs['data-time'])
        utc_dt = datetime.datetime.utcfromtimestamp(timestamp)
        local_tz = pytz.timezone('Asia/Seoul')
        local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
        tweet.created_at = local_tz.normalize(local_dt)
        reply_tag = tag.select('div.ReplyingToContextBelowAuthor')
        tweet.is_reply = len(reply_tag) > 0
        reply_cnt_tag = tag.select('span.ProfileTweet-action--reply > span.ProfileTweet-actionCount')
        if len(reply_cnt_tag) > 0:
            tweet.reply_cnt = int(reply_cnt_tag[0].attrs['data-tweet-stat-count'])
        retweet_cnt_tag = tag.select('span.ProfileTweet-action--retweet > span.ProfileTweet-actionCount')
        if len(retweet_cnt_tag) > 0:
            tweet.retweet_cnt = int(retweet_cnt_tag[0].attrs['data-tweet-stat-count'])
        favorite_cnt_tag = tag.select('span.ProfileTweet-action--favorite > span.ProfileTweet-actionCount')
        if len(favorite_cnt_tag) > 0:
            tweet.favorites_cnt = int(favorite_cnt_tag[0].attrs['data-tweet-stat-count'])
        link_tag = tag.select('a.js-permalink')
        if len(link_tag) > 0:
            tweet.tweet_link = TwitterConfig.protocol + '://' + TwitterConfig.top_url + link_tag[0].attrs['href']
        tweet.top_link = top_tw.tweet_link if top_tw else tweet.tweet_link
        tweet.depth = depth
        tweet.platform_name = 'twitter'
        tweet.platform_form = 'post'
        tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id
        # BUGFIX: was "tweet.depth is 0" -- identity comparison on an int
        # (works only by CPython small-int caching); use equality.
        tweet.article_form = 'body' if tweet.depth == 0 else 'reply'
        # tweet.article_parent = None
        tweet.article_id = tweet.user_id
        tweet.article_nickname = tweet.user_name
        # tweet.article_title = None
        tweet.article_data = tweet.text
        tweet.article_url = tweet.top_link
        # tweet.article_hit = 0
        tweet.article_date = tweet.created_at
        tweet.article_order = tweet.depth
        # tweet.article_profile = tweet.user_name
        tweet.article_profileurl = TwitterConfig.protocol + '://' + TwitterConfig.top_url + '/' + tweet.user_id
        tweet.platform_id = top_tw.user_id if top_tw else tweet.user_id
        tweet.keyword_id = keyword_id
        tweet.reply_url = tweet.tweet_link
        # tweet.etc = ''
        return tweet

    @staticmethod
    def get_lone_container(soup, parent_tw):
        """Collect 'loneTweet' reply containers that follow the parent tweet.

        Walks the lone-tweet containers from the end, stopping once the
        parent tweet itself is reached, and returns the containers after
        it in document order.
        """
        lone_tweets = soup.select('div.ThreadedConversation--loneTweet')
        container_tags = []
        for tag in reversed(lone_tweets):
            li = tag.select('li.stream-item')
            if len(li) > 0 and 'data-item-id' in li[0].attrs:
                tweet_id = int(li[0].attrs['data-item-id'])
                if tweet_id == parent_tw.tweet_id:
                    break
            container_tags.append(tag)
        return reversed(container_tags)

View File

@@ -11,6 +11,8 @@ from kakao import kakaocrawl
from naver import navercrawl from naver import navercrawl
from facebook import facebookcrawl from facebook import facebookcrawl
from facebook import facebookcrawlbs from facebook import facebookcrawlbs
from twitter import twittercrawl
from youtube import youtubecrawl
from base.baseclasses import print_and_flush from base.baseclasses import print_and_flush
@@ -26,8 +28,12 @@ class WebBasedCrawler:
self.crawler = kakaocrawl.KakaoMainCrawler() self.crawler = kakaocrawl.KakaoMainCrawler()
elif platform == "navercafe": elif platform == "navercafe":
self.crawler = navercrawl.NaverCafeMainAreaCrawler() self.crawler = navercrawl.NaverCafeMainAreaCrawler()
elif platform == "facebook": elif platform == 'facebook':
self.crawler = facebookcrawlbs.FacebookMainCrawler() self.crawler = facebookcrawlbs.FacebookMainCrawler()
elif platform == 'twitter':
self.crawler = twittercrawl.TwitterCrawler()
elif platform == 'youtube':
self.crawler = youtubecrawl.YoutubeMainCrawler()
else: else:
self.crawler = None self.crawler = None
raise Exception raise Exception
@@ -38,7 +44,7 @@ class WebBasedCrawler:
browser_opt = ('chrome', "ie", "opera", "firefox") browser_opt = ('chrome', "ie", "opera", "firefox")
platform_opt = ('instagram', 'kakaostory', 'navercafe', "facebook") platform_opt = ('instagram', 'kakaostory', 'navercafe', 'facebook', 'twitter', 'youtube')
def get_browser_info(platform_, file_name="browser.txt"): def get_browser_info(platform_, file_name="browser.txt"):
@@ -73,7 +79,7 @@ def get_browser_info(platform_, file_name="browser.txt"):
if __name__ == '__main__': if __name__ == '__main__':
""" """
sys.argv[0] webbasedcrawler.py sys.argv[0] webbasedcrawler.py
sys.argv[1] instagram, kakaochannel, navercafe, facebook sys.argv[1] instagram, kakaochannel, navercafe, facebook, twitter, youtube
sys.argv[2] keyword_id sys.argv[2] keyword_id
sys.argv[3] data group sys.argv[3] data group
sys.argv[4] start_day sys.argv[4] start_day
@@ -85,8 +91,7 @@ if __name__ == '__main__':
else: else:
print_and_flush("Check Argumenets!") print_and_flush("Check Argumenets!")
exit(1) exit(1)
crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2], crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
sys.argv[3], sys.argv[4], sys.argv[5])
crawler.start() crawler.start()
print_and_flush("Finished Crawling :)") print_and_flush("Finished Crawling :)")
exit(0) exit(0)

View File

View File

@@ -0,0 +1,7 @@
class YoutubeMainCrawl:
    """Placeholder for the YouTube crawler; not implemented yet."""

    def __init__(self):
        # No state to initialize yet.
        pass

    def start(self):
        # TODO: implement the YouTube crawl entry point.
        pass