diff --git a/CrawlerList/CrawlerList.pro b/CrawlerList/CrawlerList.pro index 2a4656c..c1200be 100644 --- a/CrawlerList/CrawlerList.pro +++ b/CrawlerList/CrawlerList.pro @@ -28,7 +28,11 @@ SOURCES += main.cpp\ skakaousermanage.cpp \ sfacebooktagmanage.cpp \ sfacebookusermanage.cpp \ - snaverblogaccuracymanager.cpp + snaverblogaccuracymanager.cpp \ + stwittertagmanage.cpp \ + stwitterusermanage.cpp \ + syoutubetagmanage.cpp \ + syoutubeusermanage.cpp HEADERS += widget.h \ smanage.h \ @@ -45,5 +49,9 @@ HEADERS += widget.h \ skakaousermanage.h \ sfacebooktagmanage.h \ sfacebookusermanage.h \ - snaverblogaccuracymanage.h + snaverblogaccuracymanage.h \ + stwittertagmanage.h \ + stwitterusermanage.h \ + syoutubetagmanage.h \ + syoutubeusermanage.h diff --git a/CrawlerList/stwittertagmanage.cpp b/CrawlerList/stwittertagmanage.cpp new file mode 100644 index 0000000..0074828 --- /dev/null +++ b/CrawlerList/stwittertagmanage.cpp @@ -0,0 +1,89 @@ +#include "stwittertagmanage.h" +#include +#include "widget.h" +STwitterTagManage::STwitterTagManage(QObject *pObject) : SManage(pObject) +{ + m_nID = 0; + connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput())); + connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError())); +} + + +void STwitterTagManage::Start() +{ + m_nMode = E_PROCESS_RUN; + m_bFinalLast = false; +} + +bool STwitterTagManage::Update() +{ + if(m_bFinalLast) return m_bFinalLast; + switch(m_nMode) + { + case E_PROCESS_RUN: + if(UseProcess() == false) + { +#if defined(Q_OS_WIN32) + m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage()); +#else + m_pro[0].start("/usr/bin/python3", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage()); +#endif + m_nMode = E_PROCESS_FINISH_WAIT; + } + break; + + case E_PROCESS_FINISH_WAIT: + break; + } 
+ + return m_bFinalLast; +} + +void STwitterTagManage::processFinished(QProcess *pPro, QString _strOut) +{ + switch(m_nMode) + { + case E_PROCESS_FINISH_WAIT: + m_nMode = E_PROCESS_RUN; + m_bFinalLast = true; + m_pMain->InsertLog("Finish Crawling :)"); + m_pMain->SetCrawlingState("Finish"); + m_ncList=1; + m_bLast = false; + break; + } +} + +void STwitterTagManage::readStandardOutput() +{ + QProcess *pPro = (QProcess*)sender(); + QThread::msleep(100); + QString str = pPro->readAllStandardOutput(); + QStringList list = str.split("\n", QString::SkipEmptyParts); + foreach(QString log,list) + { + if (m_pMain) + { + m_pMain->InsertLog(log); + } + else + exit(0); + } +} + +void STwitterTagManage::readStandardError() +{ + QProcess *pPro = (QProcess*)sender(); + QThread::msleep(100); + QString str = pPro->readAllStandardError(); + QStringList list = str.split("\n", QString::SkipEmptyParts); + foreach(QString log,list) + { + if (m_pMain) + { + m_pMain->InsertLog(log); + } + else + exit(0); + } +} diff --git a/CrawlerList/stwittertagmanage.h b/CrawlerList/stwittertagmanage.h new file mode 100644 index 0000000..2680b35 --- /dev/null +++ b/CrawlerList/stwittertagmanage.h @@ -0,0 +1,30 @@ +#ifndef STWITTERTAGMANAGE_H +#define STWITTERTAGMANAGE_H +#include "smanage.h" + +class STwitterTagManage : public SManage +{ + Q_OBJECT +public: + enum E_PROCESS_STATE + { + E_PROCESS_RUN = 0, + E_PROCESS_FINISH_WAIT, + }; + STwitterTagManage(QObject *pObject); +private: + QString makeGetListQuery(QString _str,QDate _date,int _nPage); +private: + QString m_strListQuery; + QVector m_strListURL; +protected: + bool Update(); + void Start(); + void processFinished(QProcess *pPro,QString _strOut); + void ReLoadList(); +private slots: + void readStandardOutput(); + void readStandardError(); +}; +#endif // STWITTERTAGMANAGE_H + diff --git a/CrawlerList/stwitterusermanage.cpp b/CrawlerList/stwitterusermanage.cpp new file mode 100644 index 0000000..28f751b --- /dev/null +++ 
b/CrawlerList/stwitterusermanage.cpp @@ -0,0 +1,89 @@ +#include "stwitterusermanage.h" +#include +#include "widget.h" +STwitterUserManage::STwitterUserManage(QObject *pObject) : SManage(pObject) +{ + m_nID = 0; + connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput())); + connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError())); +} + + +void STwitterUserManage::Start() +{ + m_nMode = E_PROCESS_RUN; + m_bFinalLast = false; +} + +bool STwitterUserManage::Update() +{ + if(m_bFinalLast) return m_bFinalLast; + switch(m_nMode) + { + case E_PROCESS_RUN: + if(UseProcess() == false) + { +#if defined(Q_OS_WIN32) + m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage()); +#else + m_pro[0].start("/usr/bin/python3", QStringList() << "webbasedcrawler.py" << "twitter" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage()); +#endif + m_nMode = E_PROCESS_FINISH_WAIT; + } + break; + + case E_PROCESS_FINISH_WAIT: + break; + } + + return m_bFinalLast; +} + +void STwitterUserManage::processFinished(QProcess *pPro, QString _strOut) +{ + switch(m_nMode) + { + case E_PROCESS_FINISH_WAIT: + m_nMode = E_PROCESS_RUN; + m_bFinalLast = true; + m_pMain->InsertLog("Finish Crawling :)"); + m_pMain->SetCrawlingState("Finish"); + m_ncList=1; + m_bLast = false; + break; + } +} + +void STwitterUserManage::readStandardOutput() +{ + QProcess *pPro = (QProcess*)sender(); + QThread::msleep(100); + QString str = pPro->readAllStandardOutput(); + QStringList list = str.split("\n", QString::SkipEmptyParts); + foreach(QString log,list) + { + if (m_pMain) + { + m_pMain->InsertLog(log); + } + else + exit(0); + } +} + +void STwitterUserManage::readStandardError() +{ + QProcess *pPro = (QProcess*)sender(); + QThread::msleep(100); + QString str = pPro->readAllStandardError(); + QStringList list = str.split("\n", 
QString::SkipEmptyParts); + foreach(QString log,list) + { + if (m_pMain) + { + m_pMain->InsertLog(log); + } + else + exit(0); + } +} diff --git a/CrawlerList/stwitterusermanage.h b/CrawlerList/stwitterusermanage.h new file mode 100644 index 0000000..5c370ec --- /dev/null +++ b/CrawlerList/stwitterusermanage.h @@ -0,0 +1,30 @@ +#ifndef STWITTERUSERMANAGE_H +#define STWITTERUSERMANAGE_H +#include "smanage.h" + +class STwitterUserManage : public SManage +{ + Q_OBJECT +public: + enum E_PROCESS_STATE + { + E_PROCESS_RUN = 0, + E_PROCESS_FINISH_WAIT, + }; + STwitterUserManage(QObject *pObject); +private: + QString makeGetListQuery(QString _str,QDate _date,int _nPage); +private: + QString m_strListQuery; + QVector m_strListURL; +protected: + bool Update(); + void Start(); + void processFinished(QProcess *pPro,QString _strOut); + void ReLoadList(); +private slots: + void readStandardOutput(); + void readStandardError(); +}; +#endif // STWITTERUSERMANAGE_H + diff --git a/CrawlerList/syoutubetagmanage.cpp b/CrawlerList/syoutubetagmanage.cpp new file mode 100644 index 0000000..3845df4 --- /dev/null +++ b/CrawlerList/syoutubetagmanage.cpp @@ -0,0 +1,89 @@ +#include "syoutubetagmanage.h" +#include +#include "widget.h" +SYoutubeTagManage::SYoutubeTagManage(QObject *pObject) : SManage(pObject) +{ + m_nID = 0; + connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput())); + connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError())); +} + + +void SYoutubeTagManage::Start() +{ + m_nMode = E_PROCESS_RUN; + m_bFinalLast = false; +} + +bool SYoutubeTagManage::Update() +{ + if(m_bFinalLast) return m_bFinalLast; + switch(m_nMode) + { + case E_PROCESS_RUN: + if(UseProcess() == false) + { +#if defined(Q_OS_WIN32) + m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage()); +#else + m_pro[0].start("/usr/bin/python3", QStringList() << 
"webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage()); +#endif + m_nMode = E_PROCESS_FINISH_WAIT; + } + break; + + case E_PROCESS_FINISH_WAIT: + break; + } + + return m_bFinalLast; +} + +void SYoutubeTagManage::processFinished(QProcess *pPro, QString _strOut) +{ + switch(m_nMode) + { + case E_PROCESS_FINISH_WAIT: + m_nMode = E_PROCESS_RUN; + m_bFinalLast = true; + m_pMain->InsertLog("Finish Crawling :)"); + m_pMain->SetCrawlingState("Finish"); + m_ncList=1; + m_bLast = false; + break; + } +} + +void SYoutubeTagManage::readStandardOutput() +{ + QProcess *pPro = (QProcess*)sender(); + QThread::msleep(100); + QString str = pPro->readAllStandardOutput(); + QStringList list = str.split("\n", QString::SkipEmptyParts); + foreach(QString log,list) + { + if (m_pMain) + { + m_pMain->InsertLog(log); + } + else + exit(0); + } +} + +void SYoutubeTagManage::readStandardError() +{ + QProcess *pPro = (QProcess*)sender(); + QThread::msleep(100); + QString str = pPro->readAllStandardError(); + QStringList list = str.split("\n", QString::SkipEmptyParts); + foreach(QString log,list) + { + if (m_pMain) + { + m_pMain->InsertLog(log); + } + else + exit(0); + } +} diff --git a/CrawlerList/syoutubetagmanage.h b/CrawlerList/syoutubetagmanage.h new file mode 100644 index 0000000..a8f49b2 --- /dev/null +++ b/CrawlerList/syoutubetagmanage.h @@ -0,0 +1,30 @@ +#ifndef STYOUTUBETAGMANAGE_H +#define STYOUTUBETAGMANAGE_H +#include "smanage.h" + +class SYoutubeTagManage : public SManage +{ + Q_OBJECT +public: + enum E_PROCESS_STATE + { + E_PROCESS_RUN = 0, + E_PROCESS_FINISH_WAIT, + }; + SYoutubeTagManage(QObject *pObject); +private: + QString makeGetListQuery(QString _str,QDate _date,int _nPage); +private: + QString m_strListQuery; + QVector m_strListURL; +protected: + bool Update(); + void Start(); + void processFinished(QProcess *pPro,QString _strOut); + void ReLoadList(); +private slots: + void readStandardOutput(); + void 
readStandardError(); +}; +#endif // STYOUTUBETAGMANAGE_H + diff --git a/CrawlerList/syoutubeusermanage.cpp b/CrawlerList/syoutubeusermanage.cpp new file mode 100644 index 0000000..a53f244 --- /dev/null +++ b/CrawlerList/syoutubeusermanage.cpp @@ -0,0 +1,89 @@ +#include "syoutubeusermanage.h" +#include +#include "widget.h" +SYoutubeUserManage::SYoutubeUserManage(QObject *pObject) : SManage(pObject) +{ + m_nID = 0; + connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT(readStandardOutput())); + connect(&m_pro[0], SIGNAL(readyReadStandardError()), this, SLOT(readStandardError())); +} + + +void SYoutubeUserManage::Start() +{ + m_nMode = E_PROCESS_RUN; + m_bFinalLast = false; +} + +bool SYoutubeUserManage::Update() +{ + if(m_bFinalLast) return m_bFinalLast; + switch(m_nMode) + { + case E_PROCESS_RUN: + if(UseProcess() == false) + { +#if defined(Q_OS_WIN32) + m_pro[0].start("python", QStringList() << "webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage()); +#else + m_pro[0].start("/usr/bin/python3", QStringList() << "webbasedcrawler.py" << "youtube" << m_strKeywordID << m_strGroupID << m_pMain->StartDay() << m_pMain->UntilPage()); +#endif + m_nMode = E_PROCESS_FINISH_WAIT; + } + break; + + case E_PROCESS_FINISH_WAIT: + break; + } + + return m_bFinalLast; +} + +void SYoutubeUserManage::processFinished(QProcess *pPro, QString _strOut) +{ + switch(m_nMode) + { + case E_PROCESS_FINISH_WAIT: + m_nMode = E_PROCESS_RUN; + m_bFinalLast = true; + m_pMain->InsertLog("Finish Crawling :)"); + m_pMain->SetCrawlingState("Finish"); + m_ncList=1; + m_bLast = false; + break; + } +} + +void SYoutubeUserManage::readStandardOutput() +{ + QProcess *pPro = (QProcess*)sender(); + QThread::msleep(100); + QString str = pPro->readAllStandardOutput(); + QStringList list = str.split("\n", QString::SkipEmptyParts); + foreach(QString log,list) + { + if (m_pMain) + { + m_pMain->InsertLog(log); + } + else + exit(0); + } +} + +void 
SYoutubeUserManage::readStandardError() +{ + QProcess *pPro = (QProcess*)sender(); + QThread::msleep(100); + QString str = pPro->readAllStandardError(); + QStringList list = str.split("\n", QString::SkipEmptyParts); + foreach(QString log,list) + { + if (m_pMain) + { + m_pMain->InsertLog(log); + } + else + exit(0); + } +} diff --git a/CrawlerList/syoutubeusermanage.h b/CrawlerList/syoutubeusermanage.h new file mode 100644 index 0000000..d3e461f --- /dev/null +++ b/CrawlerList/syoutubeusermanage.h @@ -0,0 +1,30 @@ +#ifndef YOUTUBE_USER_MANAGE_H +#define YOUTUBE_USER_MANAGE_H +#include "smanage.h" + +class SYoutubeUserManage : public SManage +{ + Q_OBJECT +public: + enum E_PROCESS_STATE + { + E_PROCESS_RUN = 0, + E_PROCESS_FINISH_WAIT, + }; + SYoutubeUserManage(QObject *pObject); +private: + QString makeGetListQuery(QString _str,QDate _date,int _nPage); +private: + QString m_strListQuery; + QVector m_strListURL; +protected: + bool Update(); + void Start(); + void processFinished(QProcess *pPro,QString _strOut); + void ReLoadList(); +private slots: + void readStandardOutput(); + void readStandardError(); +}; +#endif // YOUTUBE_USER_MANAGE_H + diff --git a/CrawlerList/widget.cpp b/CrawlerList/widget.cpp index 634f28a..7706626 100644 --- a/CrawlerList/widget.cpp +++ b/CrawlerList/widget.cpp @@ -22,6 +22,10 @@ #include "sfacebooktagmanage.h" #include "sfacebookusermanage.h" #include "snaverblogaccuracymanage.h" +#include "stwittertagmanage.h" +#include "stwitterusermanage.h" +#include "syoutubetagmanage.h" +#include "syoutubeusermanage.h" #include #include @@ -51,6 +55,10 @@ Widget::Widget(QWidget *parent) : QWidget(parent) , m_nMode(E_MODE_WAIT) m_pFacebookTag = new SFacebookTagManage(this); m_pFacebookUser = new SFacebookUserManage(this); m_pNaverBlogAccuracy = new SNaverBlogAccuracyManage(this); + m_pTwitterTag = new STwitterTagManage(this); + m_pTwitterUser = new STwitterUserManage(this); + m_pYoutubeTag = new SYoutubeTagManage(this); + m_pYoutubeUser = new 
SYoutubeUserManage(this); m_pManage[0] = m_pNaverCafe; m_pManage[1] = m_pNaverBlog; @@ -66,6 +74,10 @@ Widget::Widget(QWidget *parent) : QWidget(parent) , m_nMode(E_MODE_WAIT) m_pManage[11] = m_pFacebookTag; m_pManage[12] = m_pFacebookUser; m_pManage[13] = m_pNaverBlogAccuracy; + m_pManage[14] = m_pTwitterTag; + m_pManage[15] = m_pTwitterUser; + m_pManage[16] = m_pYoutubeTag; + m_pManage[17] = m_pYoutubeUser; m_db = QSqlDatabase::addDatabase("QMYSQL"); m_db.setHostName("bigbird.iptime.org"); diff --git a/CrawlerList/widget.h b/CrawlerList/widget.h index 048d5a5..c45b3f7 100644 --- a/CrawlerList/widget.h +++ b/CrawlerList/widget.h @@ -26,6 +26,10 @@ class SInstaUserManage; class SFacebookTagManage; class SFacebookUserManage; class SNaverBlogAccuracyManage; +class STwitterTagManage; +class STwitterUserManage; +class SYoutubeTagManage; +class SYoutubeUserManage; #define SAFE_DELETE(p) {if(p) delete (p); (p) = NULL; } @@ -55,7 +59,7 @@ private: QLineEdit *m_pedStartDay; QTimer m_timer,m_timerAlive; QSqlDatabase m_db; - static const int C_PLATFORM_MAX = 14; + static const int C_PLATFORM_MAX = 18; SManage *m_pManage[C_PLATFORM_MAX]; QListWidget *m_pResultList; QString m_strFileName; @@ -75,6 +79,11 @@ private: SFacebookTagManage *m_pFacebookTag; SFacebookUserManage *m_pFacebookUser; SNaverBlogAccuracyManage *m_pNaverBlogAccuracy; + STwitterTagManage* m_pTwitterTag; + STwitterUserManage* m_pTwitterUser; + SYoutubeTagManage* m_pYoutubeTag; + SYoutubeUserManage* m_pYoutubeUser; + int m_nStartTime,m_nRangeTime,m_nPlatform; //QGroupBox *m_pgbManual; QCheckBox *m_pcheckboxReal; diff --git a/CrawlerProcess/main.cpp b/CrawlerProcess/main.cpp index a3bff35..38d66fa 100644 --- a/CrawlerProcess/main.cpp +++ b/CrawlerProcess/main.cpp @@ -1,4 +1,4 @@ -#include "scrawler.h" +#include "scrawler.h" #include #include diff --git a/CrawlerProcess/scrawler.cpp b/CrawlerProcess/scrawler.cpp index 5f63155..00bed0b 100644 --- a/CrawlerProcess/scrawler.cpp +++ b/CrawlerProcess/scrawler.cpp @@ 
-1,4 +1,4 @@ -#include "scrawler.h" +#include "scrawler.h" #include #include #include diff --git a/CrawlerProcess/scrawler.h b/CrawlerProcess/scrawler.h index 4b1e26a..d433569 100644 --- a/CrawlerProcess/scrawler.h +++ b/CrawlerProcess/scrawler.h @@ -1,4 +1,4 @@ -#ifndef SCRAWLER_H +#ifndef SCRAWLER_H #define SCRAWLER_H #include diff --git a/CrawlerProcess/scrawlerdata.h b/CrawlerProcess/scrawlerdata.h index e338123..4b9b2f7 100644 --- a/CrawlerProcess/scrawlerdata.h +++ b/CrawlerProcess/scrawlerdata.h @@ -1,4 +1,4 @@ -#ifndef SCRAWLERDATA +#ifndef SCRAWLERDATA #define SCRAWLERDATA #endif // SCRAWLERDATA diff --git a/GroupManager/widget.cpp b/GroupManager/widget.cpp index 15add69..ee770c2 100644 --- a/GroupManager/widget.cpp +++ b/GroupManager/widget.cpp @@ -85,6 +85,10 @@ Widget::Widget(QWidget *parent) "WHEN 11 THEN 'Facebook Tag' " "WHEN 12 THEN 'Facebook User' " "WHEN 13 THEN 'Naver Blog Accuracy' " + "WHEN 14 THEN 'Twitter Tag' " + "WHEN 15 THEN 'Twitter User' " + "WHEN 16 THEN 'Youtube Tag' " + "WHEN 17 THEN 'Youtube User' " "ELSE 'UnKnown'" "END AS platform FROM keyword where state is null"); m_pmodelGroup->setQuery("SELECT * FROM datagroup"); @@ -140,7 +144,7 @@ QGroupBox *Widget::setKeywordWidgets() m_pcbPlatform = new QComboBox; m_pcbPlatform->addItems(QStringList() << "Naver Cafe" << "Naver Blog" << "Daum Cafe" << "Naver News" << "Naver Cafe List" << "Daum Cafe List" << "Kakao Story Channel" << "Kakao Story Tag" << "Kakao Story User" << "Instagram Tag" << "Instagram User" - << "Facebook Tag" << "Facebook User" << "Naver Blog Accuracy"); + << "Facebook Tag" << "Facebook User" << "Naver Blog Accuracy" << "Twitter Tag" << "Twitter User" << "Youtube Tag" << "Youtube User"); m_pleKeyword = new QLineEdit; m_pleAuthorship = new QLineEdit; @@ -380,6 +384,10 @@ void Widget::on_keyword_currentRowChanged(QModelIndex _index) if (str == QString("Facebook Tag")) nSelect = 11; if (str == QString("Facebook User")) nSelect = 12; if (str == QString("Naver Blog Accuracy")) 
nSelect = 13; + if (str == QString("Twitter Tag")) nSelect = 14; + if (str == QString("Twitter User")) nSelect = 15; + if (str == QString("Youtube Tag")) nSelect = 16; + if (str == QString("Youtube User")) nSelect = 17; m_pcbPlatform->setCurrentIndex(nSelect); } } @@ -504,6 +512,10 @@ void Widget::on_keyword_button_insert() "WHEN 11 THEN 'Facebook Tag' " "WHEN 12 THEN 'Facebook User' " "WHEN 13 THEN 'Naver Blog Accuracy' " + "WHEN 14 THEN 'Twitter Tag' " + "WHEN 15 THEN 'Twitter User' " + "WHEN 16 THEN 'Youtube Tag' " + "WHEN 17 THEN 'Youtube User' " "ELSE 'UnKnown'" "END AS platform FROM keyword where state is null"); } @@ -535,6 +547,10 @@ void Widget::on_keyword_button_delete() "WHEN 11 THEN 'Facebook Tag' " "WHEN 12 THEN 'Facebook User' " "WHEN 13 THEN 'Naver Blog Accuracy' " + "WHEN 14 THEN 'Twitter Tag' " + "WHEN 15 THEN 'Twitter User' " + "WHEN 16 THEN 'Youtube Tag' " + "WHEN 17 THEN 'Youtube User' " "ELSE 'UnKnown'" "END AS platform FROM keyword where state is null"); } @@ -576,6 +592,10 @@ void Widget::on_keyword_button_modify() "WHEN 11 THEN 'Facebook Tag' " "WHEN 12 THEN 'Facebook User' " "WHEN 13 THEN 'Naver Blog Accuracy' " + "WHEN 14 THEN 'Twitter Tag' " + "WHEN 15 THEN 'Twitter User' " + "WHEN 16 THEN 'Youtube Tag' " + "WHEN 17 THEN 'Youtube User' " "ELSE 'UnKnown'" "END AS platform FROM keyword where state is null"); } @@ -1100,7 +1120,14 @@ void Widget::on_group_button_copy_start() void Widget::UpdateCrawling() { m_pmodelCrawling->setQuery("SELECT _crawling.id,_keyword.realtime,_keyword.searches,_keyword.start,_keyword.end, _datagroup.name , " - "(CASE _keyword.platform WHEN 0 THEN 'Naver Cafe' WHEN 1 THEN 'Naver Blog' WHEN 2 THEN 'Daum Cafe' WHEN 3 THEN 'Naver News' WHEN 4 THEN 'Naver Cafe List' WHEN 5 THEN 'Daum Cafe List' WHEN 6 THEN 'Kakao Story Channel' " + "(CASE _keyword.platform " + "WHEN 0 THEN 'Naver Cafe' " + "WHEN 1 THEN 'Naver Blog' " + "WHEN 2 THEN 'Daum Cafe' " + "WHEN 3 THEN 'Naver News' " + "WHEN 4 THEN 'Naver Cafe List' " + "WHEN 
5 THEN 'Daum Cafe List' " + "WHEN 6 THEN 'Kakao Story Channel' " "WHEN 7 THEN 'Kakao Story Tag' " "WHEN 8 THEN 'Kakao Story User' " "WHEN 9 THEN 'Instagram Tag' " @@ -1108,6 +1135,10 @@ void Widget::UpdateCrawling() "WHEN 11 THEN 'Facebook Tag' " "WHEN 12 THEN 'Facebook User' " "WHEN 13 THEN 'Naver Blog Accuracy' " + "WHEN 14 THEN 'Twitter Tag' " + "WHEN 15 THEN 'Twitter User' " + "WHEN 16 THEN 'Youtube Tag' " + "WHEN 17 THEN 'Youtube User' " "ELSE 'UnKnown' END ) AS platform , " "(CASE _crawling.state WHEN 0 THEN 'Waiting' WHEN 1 THEN 'Running' WHEN 2 THEN 'Terminated' ELSE 'None' END ) AS state " "FROM crawling _crawling INNER JOIN keyword _keyword ON _crawling.keyword_id = _keyword.id " diff --git a/WebBasedCrawler/base/baseclasses.py b/WebBasedCrawler/base/baseclasses.py index 70fc8a8..8203cc2 100644 --- a/WebBasedCrawler/base/baseclasses.py +++ b/WebBasedCrawler/base/baseclasses.py @@ -32,6 +32,7 @@ def is_debugger_attached(): is_debug = is_debugger_attached() + def printl(*objects, sep=' ', end='\n', file=None, flush=True): if is_debug: cur_frame = inspect.currentframe() diff --git a/WebBasedCrawler/base/dbdata.py b/WebBasedCrawler/base/dbdata.py new file mode 100644 index 0000000..e36185d --- /dev/null +++ b/WebBasedCrawler/base/dbdata.py @@ -0,0 +1,79 @@ +from pymysql.connections import Connection +import datetime +from numbers import Number + +class DataDBRow: + def __init__(self): + self.platform_name = None + self.platform_form = None + self.platform_title = None + self.article_form = None + self.article_parent = None + self.article_id = None + self.article_nickname = None + self.article_title = None + self.article_data = None + self.article_url = None + self.article_hit = 0 + self.article_date = None + self.article_order = 0 + self.article_profile = None + self.article_profileurl = None + self.platform_id = None + self.keyword_id = -1 + self.reply_url = None + self.etc = None + + def get_keys(self): + inst = DataDBRow() + keys = () + for key, value_type 
in inst.__dict__.items(): + if key.startswith('__') or callable(value_type): + continue + + keys += key, + + return keys + + def get_values(self, conn, db_num): + inst = DataDBRow() + values = () + for key, value_type in inst.__dict__.items(): + if key.startswith('__') or callable(value_type): + continue + + value = self.__dict__[key] + if isinstance(value, Number): + values += str(value), + elif isinstance(value, str): + values += conn.escape(value.encode('utf8').decode('utf8')), + else: + values += conn.escape(value), + + return values + + def get_insert_query(self, conn, db_num): + + inst = DataDBRow() + + keys = '' + values = '' + for key, value_type in inst.__dict__.items(): + if key.startswith('__') or callable(value_type): + continue + + if len(keys) > 0: + keys += ', ' + values += ', ' + + keys += key + value = self.__dict__[key] + if isinstance(value, Number): + values += str(value) + elif isinstance(value, str): + values += conn.escape(value.encode('utf8').decode('utf8')) + else: + values += conn.escape(value) + + query = 'insert into data_{} ({}) values ({})'.format(db_num, keys, values) + return query diff --git a/WebBasedCrawler/base/proxy.py b/WebBasedCrawler/base/proxy.py index e3f5d6c..a36367c 100644 --- a/WebBasedCrawler/base/proxy.py +++ b/WebBasedCrawler/base/proxy.py @@ -97,6 +97,31 @@ def get_driver(platform, proxies): else: return platform_webdriver[platform](capabilities=desired_capabilities) +_expired_proxies = [] + + +def set_proxy_expired(proxy): + if proxy not in _expired_proxies: + _expired_proxies.append(proxy) + + address = proxy['http'][len('http://'):] + + with open(proxy_filename, 'r') as f: + lines = f.readlines() + + expired_idx = -1 + for idx, line in enumerate(lines): + if line.startswith(address): + expired_idx = idx + break + + if expired_idx >= 0: + lines[expired_idx] = '# ' + lines[expired_idx] + lines.append(lines.pop(expired_idx)) + + with open(proxy_filename, 'w') as f: + f.writelines(lines) + def 
get_proxy_from_file(filename): """ @@ -104,7 +129,7 @@ def get_proxy_from_file(filename): :return (ip, port): string, string if ip, port or filename is invalid, return (None, None) """ - proxy_lists = [line.replace('\n', '') for line in open(filename) if re_ip.search(line)] + proxy_lists = [line.replace('\n', '') for line in open(filename) if not line.strip().startswith('#') and re_ip.search(line)] if proxy_lists: m = re_ip.search(proxy_lists[random.randint(0, len(proxy_lists) - 1)]) if m: diff --git a/WebBasedCrawler/requirements.txt b/WebBasedCrawler/requirements.txt new file mode 100644 index 0000000..1d6caf7 --- /dev/null +++ b/WebBasedCrawler/requirements.txt @@ -0,0 +1,3 @@ +requests +bs4 +pytz diff --git a/WebBasedCrawler/twitter/__init__.py b/WebBasedCrawler/twitter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/WebBasedCrawler/twitter/twconfig.py b/WebBasedCrawler/twitter/twconfig.py new file mode 100644 index 0000000..c7d12bf --- /dev/null +++ b/WebBasedCrawler/twitter/twconfig.py @@ -0,0 +1,62 @@ +import datetime +import copy + +class TwitterConfig: + protocol = 'https' + top_url = 'twitter.com' + search_url = '/i/search/timeline' + conversation_url_form = '/i/{}/conversation/{}' + + def __init__(self): + self.keyword_id = -1 + self.db_num = -1 + + self.id = 0 + self.realtime = False + self.keywords = [] + self.start_str = None + self.start = None + self.end_str = None + self.end = None + self.authorship = None + self.state = None + self.platform = None + + def set_param(self, keyword_id, db_num, params): + self.keyword_id = int(keyword_id) + self.db_num = int(db_num) + + self.id = int(params['id']) + self.realtime = params['realtime'] == '1' + + self.keywords = [] + for keyword in params['searches'].split(','): + self.keywords.append(keyword.strip()) + + self.start_str = str(params['start']) + self.end_str = str(params['end']) + + self.start = datetime.datetime.combine(params['start'], datetime.datetime.min.time()) + self.end = 
datetime.datetime.combine(params['end'], datetime.datetime.min.time()) + + self.authorship = params['authorship'] + self.state = params['state'] + self.platform = params['platform'] + + def split(self): + split_list = [] + new_end = self.end + + while new_end > self.start: + new_config = copy.deepcopy(self) + + new_config.end = new_end + new_end = new_end + datetime.timedelta(days=-1) + new_config.start = new_end + + new_config.start_str = new_config.start.strftime('%Y-%m-%d') + new_config.end_str = new_config.end.strftime('%Y-%m-%d') + + split_list.append(new_config) + + return split_list diff --git a/WebBasedCrawler/twitter/twdbhelper.py b/WebBasedCrawler/twitter/twdbhelper.py new file mode 100644 index 0000000..74d983a --- /dev/null +++ b/WebBasedCrawler/twitter/twdbhelper.py @@ -0,0 +1,79 @@ +from twitter.tweet import Tweet +import multiprocessing as mp + + +class TwitterDBHelper: + pymysql = __import__('pymysql.cursors') + + def __init__(self): + self.tweets = [] + self.buffer = [] + self.lock = mp.Lock() + pass + + def __del__(self): + pass + + def get_param(self, keyword_id): + query = "select * from keyword where id = " + str(keyword_id) + params = [] + try: + conn = self.pymysql.connect(host='bigbird.iptime.org', + user='admin', passwd='admin123', + db='concepters', charset='utf8', + cursorclass=self.pymysql.cursors.DictCursor) + + with conn.cursor() as cursor: + cursor.execute(query) + params = cursor.fetchone() + + except Exception as e: + print(e) + exit(1) + + else: + conn.close() + + return params + + def insert_tweet(self, tweet: Tweet = None, db_num: int = -1, flush=False): + + # self.lock.acquire() + # if tweet is not None: + # self.buffer.append((tweet, db_num, )) + # + # local_buffer = None + # if len(self.buffer) >= 100 or flush: + # local_buffer = copy.deepcopy(self.buffer) + # self.buffer.clear() + # self.lock.release() + + local_buffer = [(tweet, db_num, )] + if local_buffer: + while True: + try: + conn = 
self.pymysql.connect(host='bigbird.iptime.org', + user='admin', passwd='admin123', + db='concepters', charset='utf8', + cursorclass=self.pymysql.cursors.DictCursor, + connect_timeout=5) + + except Exception as e: + print(e) + continue + + else: + break + + try: + with conn.cursor() as cursor: + for tweet, _db_num in local_buffer: + query = tweet.get_insert_query(conn, _db_num) + cursor.execute(query) + conn.commit() + + except Exception as e: + print(e) + + finally: + conn.close() diff --git a/WebBasedCrawler/twitter/tweet.py b/WebBasedCrawler/twitter/tweet.py new file mode 100644 index 0000000..c5d0d2c --- /dev/null +++ b/WebBasedCrawler/twitter/tweet.py @@ -0,0 +1,24 @@ +from base.dbdata import DataDBRow + + +class Tweet(DataDBRow): + + def __init__(self): + super(self.__class__, self).__init__() + + self.tweet_id = None + self.user_id = None + self.user_name = None + self.text = None + self.created_at = None + self.retweets = 0 + self.favorites = 0 + + self.is_reply = False + self.reply_cnt = 0 + self.retweet_cnt = 0 + self.favorite_cnt = 0 + self.top_link = None + self.tweet_link = None + + self.depth = 0 diff --git a/WebBasedCrawler/twitter/twittercrawl.py b/WebBasedCrawler/twitter/twittercrawl.py new file mode 100644 index 0000000..c77e36c --- /dev/null +++ b/WebBasedCrawler/twitter/twittercrawl.py @@ -0,0 +1,289 @@ +from twitter.twconfig import TwitterConfig +from twitter.twdbhelper import TwitterDBHelper +from twitter.tweet import Tweet +from twitter.twparser import TweetParser + +import base.proxy +import base.baseclasses + +import requests +import bs4 +import json +import urllib +import threading +import queue +import time + + +class TwitterCrawler(): + + def __init__(self): + self.default_config = TwitterConfig() + self.db_helper = TwitterDBHelper() + + def set_arguments(self, browser, keyword_id, db_num, before_day, until_page): + params = self.db_helper.get_param(keyword_id) + self.default_config.set_param(keyword_id, db_num, params) + + @staticmethod 
+ def get_timeline_url(query, start_str, end_str, max_position=''): + params = { + 'f': 'tweets', + 'vertical': 'default', + 'src': 'typd', + 'q': '{} since:{} until:{}'.format(query, start_str, end_str), + 'language': 'en', + 'max_position': max_position, + } + + url_tupple = (TwitterConfig.protocol, TwitterConfig.top_url, TwitterConfig.search_url, '', urllib.parse.urlencode(params), '') + return urllib.parse.urlunparse(url_tupple) + + @staticmethod + def get_content_url(user_id, tweet_id, max_position=''): + params = { + 'max_position': max_position, + } + + sub_url = TwitterConfig.conversation_url_form.format(user_id, tweet_id) + url_tupple = (TwitterConfig.protocol, TwitterConfig.top_url, sub_url, '', urllib.parse.urlencode(params), '') + return urllib.parse.urlunparse(url_tupple) + + @staticmethod + def get_page(url, proc_id): + headers = { + 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36', + 'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4', + } + # if proxies is None: + proxies = base.proxy.get_proxy_for_requests() + + resp = None + while True: + try: + resp = requests.get(url, headers=headers, proxies=proxies, timeout=3) + except Exception as e: + if proxies == (None, None): + break + + print('[{}] proxy {} is expired. 
({})'.format(proc_id, proxies, e)) + base.proxy.set_proxy_expired(proxies) + proxies = base.proxy.get_proxy_for_requests() + else: + break + + return resp + + def runner_proc(self, proc_id, content_queue, result_queue, config): + print('{} to {} runner thread start'.format(config.start_str, config.end_str)) + + b_continue = True + min_tweet_id = None + max_tweet_id = None + max_position = '' + tweet_count = 0 + + while b_continue: + if min_tweet_id is not None: + max_position = 'TWEET-{}-{}'.format(max_tweet_id, min_tweet_id) + url = self.get_timeline_url(config.keywords[0], config.start_str, config.end_str, max_position) + resp = self.get_page(url, proc_id) + if resp is None: + break + + j = json.loads(resp.content.decode('utf-8')) + soup = bs4.BeautifulSoup(j['items_html'], 'lxml') + tweet_tags = soup.select("div.tweet") + + for tw in tweet_tags: + tweet = TweetParser.parse(tw, config.keyword_id) + + if tweet.is_reply is True: + # print(' ## {}: {}...'.format(tweet.user_name, tweet.text[:20])) + continue + + if tweet.reply_cnt > 0: + self.insert_content_pool(proc_id, content_queue, tweet, tweet) + + self.db_helper.insert_tweet(tweet, config.db_num) + + # print('{} {}: {}...'.format(tweet.created_at, tweet.user_name, tweet.text[:20])) + + count = len(tweet_tags) + if count == 0: + break + + if min_tweet_id is None: + min_tweet_id = tweet_tags[0].attrs['data-item-id'] + max_tweet_id = tweet_tags[-1].attrs['data-item-id'] + tweet_count += count + + print('{} to {} runner thread finished {}'.format(config.start_str, config.end_str, tweet_count)) + result_queue.put((proc_id, tweet_count, )) + # self.runner_processing[proc_id].value = False + return proc_id, tweet_count, + + @staticmethod + def insert_content_pool(proc_id: int, qu, tweet: Tweet, tweet_top: Tweet): + # print(' [{}] pool insert: {} ({})'.format(proc_id, tweet.text[:20] if tweet.text else '', tweet.tweet_link)) + qu.put((tweet, tweet_top,)) + + @staticmethod + def get_content(content_queue): + sleep_time 
= time.time() + while True: + try: + parent_tw, top_tw, = content_queue.get(block=True, timeout=2) + except Exception as e: + if time.time()-sleep_time > 60: + break + else: + continue + else: + return parent_tw, top_tw, + + return None, None, + + def content_proc(self, proc_id, content_queue, result_queue): + print('[{}] content thread start'.format(proc_id)) + + tweet_count = 0 + while True: + parent_tw, top_tw, = self.get_content(content_queue) + if not parent_tw: + break + + # print(' [{}] <<< parent : {} ({})'.format(proc_id, parent_tw.text[:20], parent_tw.tweet_link)) + + max_position = '' + + b_continue = True + while b_continue: + url = self.get_content_url(parent_tw.user_id, parent_tw.tweet_id, max_position) + resp = self.get_page(url, proc_id) + if resp is None or resp.status_code == 404: + break + elif resp.status_code != 200: + print('[WARNING] content_get code {}'.format(resp.status_code)) + continue + + j = json.loads(resp.content.decode('utf-8')) + soup = bs4.BeautifulSoup(j['items_html'], 'lxml') + + reply_container_tags = soup.select('li.ThreadedConversation') + reply_container_tags += TweetParser.get_lone_container(soup, parent_tw) + for container_tags in reply_container_tags: + tweet_tags = container_tags.select('div.tweet') + if len(tweet_tags) > 0: + tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw) + # print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link)) + self.insert_content_pool(proc_id, content_queue, tweet, top_tw) + self.db_helper.insert_tweet(tweet, self.default_config.db_num) + tweet_count += 1 + + b_continue = j['has_more_items'] + if b_continue: + max_position = j['min_position'] + + result_queue.put((proc_id, tweet_count)) + print('[{}] content thread finished'.format(proc_id)) + return proc_id, tweet_count, + + def debug_content(self): + content_qu = queue.Queue() + runner_result_qu = queue.Queue() + 
content_result_qu = queue.Queue() + + test_tw = Tweet() + # test_tw.tweet_link = 'https://twitter.com/yniold_/status/886863893137678337' + # test_tw.user_id = 'yniold_' + # test_tw.tweet_id = 886863893137678337 + + test_tw.tweet_link = 'https://twitter.com/Awesome_vely/status/888704413111435264' + test_tw.user_id = 'Awesome_vely' + test_tw.tweet_id = 888704413111435264 + + test_tw.text = '시작' + self.insert_content_pool(0, content_qu, test_tw, test_tw) + + content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)] + [th.start() for th in content_threads] + [th.join() for th in content_threads] + + while not content_result_qu.empty(): + res = content_result_qu.get() + print('reply : {}'.format(res)) + + print('end all') + + def test_insert_db(self): + test_tw = Tweet() + test_tw.tweet_link = 'https://twitter.com/moonriver365/status/885797401033818112' + test_tw.user_id = 'moonriver365' + test_tw.tweet_id = 885797401033818112 + for _ in range(5): + self.db_helper.insert_tweet(test_tw, self.default_config.db_num) + + def debug(self): + if base.baseclasses.is_debug: + ## check proxy + # base.proxy.get_proxy_from_file('proxy.txt') + # proxy = {'https': 'http://45.56.86.93:3128', 'http': 'http://45.56.86.93:3128'} + # base.proxy.set_proxy_expired(proxy) + # return + + ## contents check + self.debug_content() + + # split_config = self.default_config.split() + + # self.test_insert_db() + + print("debug end") + # exit() + + def start(self): + start_time = time.time() + + # self.debug() + # return + + # run + split_config = self.default_config.split() + content_qu = queue.Queue() + runner_result_qu = queue.Queue() + content_result_qu = queue.Queue() + + runner_threads = [threading.Thread(target=self.runner_proc, args=(proc_id, content_qu, runner_result_qu, config)) for proc_id, config in enumerate(split_config)] + content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, 
content_result_qu)) for proc_id in range(16)] + + [th.start() for th in runner_threads] + [th.start() for th in content_threads] + + [th.join() for th in runner_threads] + [th.join() for th in content_threads] + + # rerun zero runners + runner_threads = [] + runner_result_qu2 = queue.Queue() + idx = 0 + while not runner_result_qu.empty(): + res = runner_result_qu.get() + if res[1] == 0: + th = threading.Thread(target=self.runner_proc, args=(res[0], content_qu, runner_result_qu2, split_config[res[0]])) + runner_threads.append(th) + + idx += 1 + content_threads = [threading.Thread(target=self.content_proc, args=(proc_id, content_qu, content_result_qu)) for proc_id in range(16)] + + [th.start() for th in runner_threads] + [th.start() for th in content_threads] + + [th.join() for th in runner_threads] + [th.join() for th in content_threads] + + # print running time + delta = time.time() - start_time + m, s = divmod(delta, 60) + h, m = divmod(m, 60) + print("finished all {}:{:02d}:{:02d} ".format(int(h), int(m), int(s))) diff --git a/WebBasedCrawler/twitter/twparser.py b/WebBasedCrawler/twitter/twparser.py new file mode 100644 index 0000000..257f964 --- /dev/null +++ b/WebBasedCrawler/twitter/twparser.py @@ -0,0 +1,96 @@ +from twitter.tweet import Tweet +from twitter.twconfig import TwitterConfig + +import bs4 +import datetime +import pytz + +class TweetParser: + + @staticmethod + def parse(tag, keyword_id, depth=0, top_tw: Tweet=None): + tweet = Tweet() + + tweet.tweet_id = int(tag.attrs['data-tweet-id']) + + nickname_tag = tag.select('strong.fullname')[0] + tweet.user_name = '' + for child in nickname_tag.children: + if isinstance(child, bs4.element.NavigableString): + if len(tweet.user_name) > 0: + tweet.user_name += ' ' + tweet.user_name += child + tweet.user_id = tag.select('span.username')[0].text[1:] + tweet.text = tag.select('p.tweet-text')[0].text + + # time_str = tag.select('a.tweet-timestamp')[0].attrs['title'] + # english + # tweet.created_at =
datetime.datetime.strptime(time_str, '%I:%M %p - %d %b %Y') + # korean + # time_str = time_str.replace('오전', 'AM').replace('오후', 'PM') + # tweet.created_at = datetime.datetime.strptime(time_str, '%p %I:%M - %Y년 %m월 %d일') + + timestamp = int(tag.select('span._timestamp')[0].attrs['data-time']) + utc_dt = datetime.datetime.utcfromtimestamp(timestamp) + local_tz = pytz.timezone('Asia/Seoul') + local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz) + tweet.created_at = local_tz.normalize(local_dt) + + reply_tag = tag.select('div.ReplyingToContextBelowAuthor') + tweet.is_reply = len(reply_tag) > 0 + + reply_cnt_tag = tag.select('span.ProfileTweet-action--reply > span.ProfileTweet-actionCount') + if len(reply_cnt_tag) > 0: + tweet.reply_cnt = int(reply_cnt_tag[0].attrs['data-tweet-stat-count']) + + retweet_cnt_tag = tag.select('span.ProfileTweet-action--retweet > span.ProfileTweet-actionCount') + if len(retweet_cnt_tag) > 0: + tweet.retweet_cnt = int(retweet_cnt_tag[0].attrs['data-tweet-stat-count']) + + favorite_cnt_tag = tag.select('span.ProfileTweet-action--favorite > span.ProfileTweet-actionCount') + if len(favorite_cnt_tag) > 0: + tweet.favorites_cnt = int(favorite_cnt_tag[0].attrs['data-tweet-stat-count']) + + link_tag = tag.select('a.js-permalink') + if len(link_tag) > 0: + tweet.tweet_link = TwitterConfig.protocol + '://' + TwitterConfig.top_url + link_tag[0].attrs['href'] + tweet.top_link = top_tw.tweet_link if top_tw else tweet.tweet_link + + tweet.depth = depth + + tweet.platform_name = 'twitter' + tweet.platform_form = 'post' + tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id + tweet.article_form = 'body' if tweet.depth == 0 else 'reply' + # tweet.article_parent = None + tweet.article_id = tweet.user_id + tweet.article_nickname = tweet.user_name + # tweet.article_title = None + tweet.article_data = tweet.text + tweet.article_url = tweet.top_link + # tweet.article_hit = 0 + tweet.article_date = tweet.created_at + tweet.article_order
= tweet.depth + # tweet.article_profile = tweet.user_name + tweet.article_profileurl = TwitterConfig.protocol + '://' + TwitterConfig.top_url + '/' + tweet.user_id + tweet.platform_id = top_tw.user_id if top_tw else tweet.user_id + tweet.keyword_id = keyword_id + tweet.reply_url = tweet.tweet_link + # tweet.etc = '' + + return tweet + + @staticmethod + def get_lone_container(soup, parent_tw): + lone_tweets = soup.select('div.ThreadedConversation--loneTweet') + container_tags = [] + for tag in reversed(lone_tweets): + li = tag.select('li.stream-item') + if len(li) > 0 and 'data-item-id' in li[0].attrs: + tweet_id = int(li[0].attrs['data-item-id']) + if tweet_id == parent_tw.tweet_id: + break + + container_tags.append(tag) + + return reversed(container_tags) diff --git a/WebBasedCrawler/webbasedcrawler.py b/WebBasedCrawler/webbasedcrawler.py index 9f03e71..44a0853 100644 --- a/WebBasedCrawler/webbasedcrawler.py +++ b/WebBasedCrawler/webbasedcrawler.py @@ -11,6 +11,8 @@ from kakao import kakaocrawl from naver import navercrawl from facebook import facebookcrawl from facebook import facebookcrawlbs +from twitter import twittercrawl +from youtube import youtubecrawl from base.baseclasses import print_and_flush @@ -26,8 +28,12 @@ class WebBasedCrawler: self.crawler = kakaocrawl.KakaoMainCrawler() elif platform == "navercafe": self.crawler = navercrawl.NaverCafeMainAreaCrawler() - elif platform == "facebook": + elif platform == 'facebook': self.crawler = facebookcrawlbs.FacebookMainCrawler() + elif platform == 'twitter': + self.crawler = twittercrawl.TwitterCrawler() + elif platform == 'youtube': + self.crawler = youtubecrawl.YoutubeMainCrawler() else: self.crawler = None raise Exception @@ -38,7 +44,7 @@ class WebBasedCrawler: browser_opt = ('chrome', "ie", "opera", "firefox") -platform_opt = ('instagram', 'kakaostory', 'navercafe', "facebook") +platform_opt = ('instagram', 'kakaostory', 'navercafe', 'facebook', 'twitter', 'youtube') def get_browser_info(platform_, 
file_name="browser.txt"): @@ -73,7 +79,7 @@ def get_browser_info(platform_, file_name="browser.txt"): if __name__ == '__main__': """ sys.argv[0] webbasedcrawler.py - sys.argv[1] instagram, kakaochannel, navercafe, facebook + sys.argv[1] instagram, kakaochannel, navercafe, facebook, twitter, youtube sys.argv[2] keyword_id sys.argv[3] data group sys.argv[4] start_day @@ -85,8 +91,7 @@ if __name__ == '__main__': else: print_and_flush("Check Argumenets!") exit(1) - crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2], - sys.argv[3], sys.argv[4], sys.argv[5]) + crawler = WebBasedCrawler(get_browser_info(sys.argv[1]), sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5]) crawler.start() print_and_flush("Finished Crawling :)") exit(0) diff --git a/WebBasedCrawler/youtube/__init__.py b/WebBasedCrawler/youtube/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/WebBasedCrawler/youtube/youtubecrawl.py b/WebBasedCrawler/youtube/youtubecrawl.py new file mode 100644 index 0000000..bb34752 --- /dev/null +++ b/WebBasedCrawler/youtube/youtubecrawl.py @@ -0,0 +1,7 @@ + +class YoutubeMainCrawler: + def __init__(self): + pass + + def start(self): + pass \ No newline at end of file