Compare commits

6 Commits

Author SHA1 Message Date
mjjo
24587435b6 네이버 블로그, 카페 크롤러 문제 수정 2017-08-17 16:51:04 +09:00
mjjo
8854af26d6 gitignore 파일 수정 2017-08-14 19:00:24 +09:00
mjjo
ed7a6ddad9 인스타 content crawling 실패시 재시도 2017-08-14 18:56:43 +09:00
mjjo
87968097a9 args print 2017-08-11 12:27:16 +09:00
mjjo
3142782428 프록시에 lock 적용, exception 해결 2017-08-11 12:25:53 +09:00
mjjo
aa2f5b9f71 트위터 크롤러 상위 작성자 표시 2017-08-10 17:00:12 +09:00
13 changed files with 202 additions and 180 deletions

12
.gitignore vendored
View File

@@ -1,8 +1,12 @@
**/.idea/
**/__pycache__/
*.user
**/.idea/
**/build-*/
WebBasedCrawler/proxy.txt
clients-win/
clients-linux/
**/*.log
*.user
*.csv
*.log
*.bak
WebBasedCrawler/proxy.txt

View File

@@ -4,8 +4,8 @@
SNaverNewsManage::SNaverNewsManage(QObject *pObject) : SManage(pObject)
{
m_nID = 0;
connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT());
m_nID = 0;
// connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT());
}
QString SNaverNewsManage::makeGetListQuery(QString _str,QDate _date,int _nPage)

View File

@@ -10,7 +10,12 @@ using namespace std;
int main(int argc, char *argv[])
{
srand(time(0));
// cout << "arguments: ";
// for(int i=0; i<argc; i++)
// cout << " " << argv[i];
// cout << endl;
srand(time(0));
QApplication a(argc, argv);
a.setApplicationName(QString("Chrome"));
a.setApplicationVersion(QString("50.0.2661.102"));
@@ -39,5 +44,6 @@ int main(int argc, char *argv[])
process->load(strArgv);
a.exec();
delete process;
return 0;
}

View File

@@ -38,11 +38,18 @@ SCrawler::SCrawler():QObject()
m_nRetryCount = 0;
m_bProcessed = false;
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
srand(time(NULL));
srand(time(NULL));
m_pNAM = new QNetworkAccessManager(this);
m_page->setNetworkAccessManager(m_pNAM);
}
SCrawler::~SCrawler()
{
m_page->setNetworkAccessManager(nullptr);
delete m_pNAM;
delete m_page;
}
void SCrawler::load(QStringList _strlistArgv)
@@ -159,22 +166,23 @@ void SCrawler::load(QStringList _strlistArgv)
}
cout << m_strUrl.toStdString() << endl;
QUrl url = QUrl(m_strUrl);
QUrl url = QUrl(m_strUrl);
if (url.scheme().isEmpty())
url.setScheme("http");
m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);
QNetworkRequest *request = new QNetworkRequest;
request->setUrl(url);
QNetworkRequest request;
request.setUrl(url);
/*
request->setRawHeader("Cache-Control","max-age=0, no-cache");
request->setRawHeader("Pragma","no-cache");
request->setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
request.setRawHeader("Cache-Control","max-age=0, no-cache");
request.setRawHeader("Pragma","no-cache");
request.setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
*/
if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
request->setRawHeader("Referer",m_strReper.toLocal8Bit());
request->setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
m_page->mainFrame()->load(*request);
if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
request.setRawHeader("Referer",m_strReper.toLocal8Bit());
request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
m_page->mainFrame()->load(request);
m_bLast = false;
m_bError = false;
}
@@ -198,6 +206,8 @@ void SCrawler::saveResult(bool ok)
{
// qDebug() << "saveResult";
// cout << "page data: "<< m_page->bytesReceived() << endl;
if (!ok)
{
cout << "Failed loading";
@@ -209,14 +219,17 @@ void SCrawler::saveResult(bool ok)
//qDebug() << "load complete";
switch(m_nSelect)
{
case E_NAVER_NEWS_LIST:saveFrameNewsList(m_page->mainFrame());break;
case E_NAVER_NEWS_LIST:
saveFrameNewsList(m_page->mainFrame());
break;
case E_NAVER_NEWS_DATA:
{
static bool loaded = false;
if(!loaded)
{
loaded = true;
if(!saveFrameNewsUrl(m_page->mainFrame()))
if(saveFrameNewsUrl(m_page->mainFrame()) == false)
{
loaded = false;
return;
@@ -229,48 +242,57 @@ void SCrawler::saveResult(bool ok)
break;
}
case E_NAVER_NEWS_REPLY:
{
if(!saveFrameNewsComment(m_page->mainFrame()))
return;
break;
}
case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break;
case E_NAVER_CAFE_LIST:
saveFrameCafeList(m_page->mainFrame());
break;
case E_NAVER_CAFE_DATA:
{
saveFrameCafeUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_NAVER_BLOG_LIST:
{
if(saveFrameList(m_page->mainFrame()))
break;
else
return;
}
case E_NAVER_BLOG_BODY:
{
if(!saveFrameUrl(m_page->mainFrame()))
case E_NAVER_BLOG_LIST:
if(saveFrameList(m_page->mainFrame()) == false)
return;
break;
case E_NAVER_BLOG_BODY:
if(saveFrameUrl(m_page->mainFrame()) == false)
return;
bodydata.sendDB();
break;
}
case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break;
case E_DAUM_CAFE_LIST:saveFrameDaumCafeList(m_page->mainFrame());break;
case E_NAVER_BLOG_REPLY:
saveFrameComment(m_page->mainFrame());
break;
case E_DAUM_CAFE_LIST:
saveFrameDaumCafeList(m_page->mainFrame());
break;
case E_DAUM_CAFE_DATA:
{
saveFrameDaumCafeUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_DAUM_BLOG_LIST:saveFrameDaumBlogList(m_page->mainFrame());break;
case E_DAUM_BLOG_LIST:
saveFrameDaumBlogList(m_page->mainFrame());
break;
case E_DAUM_BLOG_BODY:
{
saveFrameDaumBlogUrl(m_page->mainFrame());
bodydata.sendDB();
break;
}
case E_DAUM_BLOG_REPLY:saveFrameDaumBlogComment(m_page->mainFrame());break;
case E_DAUM_BLOG_REPLY:
saveFrameDaumBlogComment(m_page->mainFrame());
break;
}
switch(m_nSelect)
@@ -296,8 +318,8 @@ void SCrawler::saveResult(bool ok)
cout << "last";
m_bLast = false;
}
break;
case E_NAVER_BLOG_REPLY:
case E_NAVER_NEWS_REPLY:
case E_DAUM_BLOG_REPLY:
@@ -327,9 +349,9 @@ void SCrawler::saveResult(bool ok)
}
break;
}
qDebug() << "finish";
emit finished();
qDebug() << " finish";
emit finished();
}
int SCrawler::GetNumber(QString _str)
@@ -407,15 +429,13 @@ void SCrawler::reloadListPage()
bool SCrawler::saveFrameList(QWebFrame *frame)
{
if (m_bProcessed == false)
m_bProcessed = true;
else
return false;
//qDebug() << frame->documentElement().toPlainText();
if (m_bUse == true) return true;
if (m_bUse == true)
return true;
QWebElement notFound = Find(frame->documentElement(),"div","id","notfound");
if(notFound.isNull() == false)
{
@@ -577,7 +597,11 @@ bool SCrawler::saveFrameList(QWebFrame *frame)
{
QWebElement total = Find(eleMain,"span","class","title_num");
if (total.toPlainText().isEmpty()) {m_bError = true; return true;}
if (total.toPlainText().isEmpty())
{
m_bError = true;
return true;
}
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
QStringList strList = m_strUrl.split("&");
bool ok = false;
@@ -817,7 +841,7 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame)
void SCrawler::reloadPage()
{
//qDebug() << "reloadPage called";
// qDebug() << "reloadPage called";
saveResult(true);
}
@@ -1059,7 +1083,11 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame)
{
QWebElement total = Find(eleMain,"span","class","title_num");
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
if (total.toPlainText().isEmpty())
{
m_bError = true;
return;
}
total.toPlainText().split("/").size();
int nTotal = GetNumber(total.toPlainText().split("/").at(1));
QStringList strList = m_strUrl.split("&");
@@ -1377,7 +1405,11 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
b_last = b_last | !(Find(frame->documentElement(), "div", "class", "result_message mg_cont").isNull());
QWebElement total = Find(eleMain,"span","class","f_nb f_l");
if (total.toPlainText().isEmpty()) {m_bError = true; return;}
if (total.toPlainText().isEmpty())
{
m_bError = true;
return;
}
QString strTotal = total.toPlainText().split("/").at(1);
strTotal = strTotal.replace(",","");
@@ -1635,47 +1667,66 @@ void SCrawler::saveFrameDaumBlogList(QWebFrame *frame){}
void SCrawler::saveFrameNewsList(QWebFrame *frame)
{
if (m_bUse == true) return;
if (m_bUse == true)
return;
// QFile file("pagedata.txt");
// if ( file.open(QIODevice::ReadWrite) )
// {
// QTextStream stream( &file );
// stream << frame->documentElement().toOuterXml() << endl;
// file.close();
// }
QWebElement notFound = Find(frame->documentElement(),"div","class","no_content");
if(notFound.isNull() == false)
{
m_bLast = true;
return;
}
QWebElement eleMain = Find(frame->documentElement(),"div","class","srch_result_area headline");
foreach(QWebElement eleSub,eleMain.findAll("div"))
{
if (eleSub.attribute("class") == QString("info"))
{
QString str = Find(eleSub,"a","class","go_naver").attribute("href");
if (str.trimmed().isEmpty()) continue;
if (str.contains("http://sports")) continue;
if (str.trimmed().isEmpty())
continue;
if (str.contains("http://sports"))
continue;
m_bNothing = true;
cout << "o " << str.toStdString() << endl;
}
}
QStringList strTotal = bodydata.GetNumber(Find(frame->documentElement(),"span","class","result_num").toPlainText().trimmed());
QVector <int> vecTotal;
foreach(QString str,strTotal)
{
if (str.trimmed().isEmpty() == false)
vecTotal.push_back(str.toInt());
}
// QStringList strTotal = bodydata.GetNumber(Find(frame->documentElement(),"span","class","result_num").toPlainText().trimmed());
QWebElement Total = Find(frame->documentElement(), "div", "class", "title_desc");
QStringList nums = bodydata.GetNumber(Total.toPlainText());
if(nums.count() < 3)
{
m_bError = true;
m_bUse = true;
return;
}
QVector <int> vecTotal;
vecTotal.push_back(nums[0].toInt());
vecTotal.push_back(nums[1].toInt());
vecTotal.push_back(nums[2].toInt());
if (vecTotal[0] >= vecTotal[1] || vecTotal[1] == vecTotal[2])
m_bLast = true;
if (vecTotal.size() == 3)
{
if (vecTotal[0] >= vecTotal[1]) m_bLast = true;
if (vecTotal[1] == vecTotal[2]) m_bLast = true;
}
else
m_bError = true;
m_bUse = true;
}
bool SCrawler::saveFrameNewsUrl(QWebFrame *frame)
{
if (m_bUse) return true;
if (m_bUse)
return true;
{
QString strQuery = "delete from ";
@@ -2386,21 +2437,27 @@ bool SCrawler::setProxyFromFile()
//QNetworkAccessManager *manager = new QNetworkAccessManager;
switch(strList.size())
{
{
case 1:
cout << "p : " << strList.at(0).toStdString() << " from File" << endl;
{
cout << "p : " << strList.at(0).toStdString() << " from File" << endl;
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
//m_page->setNetworkAccessManager(manager);
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break;
QNetworkProxy proxy(QNetworkProxy::HttpProxy, strList.at(0));
QNetworkProxy::setApplicationProxy(proxy);
}
break;
case 2:
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl;
{
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl;
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
//m_page->setNetworkAccessManager(manager);
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
QNetworkProxy proxy(QNetworkProxy::HttpProxy, strList.at(0), strList.at(1).toInt());
QNetworkProxy::setApplicationProxy(proxy);
}
break;
}
}
@@ -2439,11 +2496,12 @@ bool SCrawler::setProxyFromDb()
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break;
case 2:
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from DB" << endl;
m_strProxyIP = strList.at(0);
m_nProxyPort = strList.at(1).toInt();
m_strProxyIP = strList.at(0);
m_nProxyPort = strList.at(1).toInt();
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
cout << "p : " << m_strProxyIP.toStdString() << ":" << m_nProxyPort << " from DB" << endl;
//QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy, m_strProxyIP, m_nProxyPort)));
/*
QString strProxyHost = "61.103.7.74";
int nPort = 2074;
@@ -2468,7 +2526,6 @@ bool SCrawler::setProxyFromDb()
void SCrawler::setProxy()
{
bool ok = setProxyFromFile() || setProxyFromDb();
//bool ok = false;
if (!ok)
cout << "No Proxy" << endl;
}

View File

@@ -6,6 +6,7 @@
class SCrawler : public QObject
{
Q_OBJECT
public:
enum E_SELECT
{
@@ -25,7 +26,7 @@ public:
};
public:
SCrawler();
~SCrawler();
virtual ~SCrawler();
void load(QStringList _strlistArgv);
void saveFile();
static void Debug(QString _strFilename,QString _strData);
@@ -35,6 +36,7 @@ private slots:
void saveResult(bool ok);
void reloadPage();
void reloadListPage();
private:
int m_nSelect;
QString m_strReper;
@@ -43,6 +45,7 @@ private:
SCrawlerData bodydata;
QWebPage *m_page;
QNetworkAccessManager* m_pNAM;
QString m_strFile;
QString m_strUrl;
QString m_strTable;

View File

@@ -136,6 +136,9 @@ QStringList SCrawlerData::GetNumber(QString _str)
{
if (pch[i].isNumber() || pch[i].isSpace())
str += pch[i];
else if(pch[i] != ',' && pch[i] != '.')
str += ' ';
}
return str.trimmed().split(" ");
return str.trimmed().split(" ", QString::SkipEmptyParts);
}

View File

@@ -25,9 +25,13 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def is_debugger_attached():
for frame in inspect.stack():
if frame[1].endswith("pydevd.py"):
return True
try:
for frame in inspect.stack():
if frame[1].endswith("pydevd.py"):
return True
except:
return False
return False
is_debug = is_debugger_attached()

View File

@@ -103,32 +103,27 @@ class Proxy2Handler:
def lock_enter(self):
# logger.log('lock {}'.format(threading.current_thread().ident))
# self.lock.acquire()
self.lock.acquire()
pass
def lock_leave(self):
# self.lock.release()
self.lock.release()
# logger.log('unlock {}'.format(threading.current_thread().ident))
pass
def commit(self):
self.lock_enter()
# self.session.commit()
self.lock_leave()
pass
def get_oldest(self, platform):
self.lock_enter()
instance = self.session.query(Proxy2Model).order_by(self.block_field_map[platform].desc()).first()
self.lock_leave()
return instance
def get_query(self, ip, port):
return self.session.query(Proxy2Model).filter_by(ip=ip).filter_by(port=port)
def get_instance(self, ip, port):
self.lock_enter()
instance = self.get_query(ip, port).first()
self.lock_leave()
return instance
def check_all_proxies(self, platform):
@@ -161,7 +156,7 @@ class Proxy2Handler:
if resp.ok:
instance.set_block_at(platform, None)
alive_cnt += 1
print('proxy {}:{} alive'.format(instance.ip, instance.port))
# print('proxy {}:{} alive'.format(instance.ip, instance.port))
else:
instance.set_block_at(platform, datetime.datetime.now())
@@ -171,82 +166,40 @@ class Proxy2Handler:
def get(self, platform, proc_id=-1):
self.lock_enter()
try:
block_column = self.block_field_map[platform]
instances = self.session.query(Proxy2Model).filter(block_column == None).limit(32).all()
instance = instances[random.randint(0, len(instances)-1)] if len(instances) > 0 else None
proxy = None
if instance:
proxy = instance.get_instance_for_http()
else:
cnt = self.check_all_proxies(platform)
if cnt <= 0:
proxies = proxy_crawler.crawl_proxies()
self.insert_all(proxies)
block_column = self.block_field_map[platform]
try:
instances = self.session.query(Proxy2Model).filter(block_column == None).limit(32).all()
except Exception as e:
dbg.print_exception()
assert True
self.lock_leave()
# try:
# session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine)
# self.session = sqlalchemy.orm.scoped_session(session_factory)
# logger.log('{} session recreate'.format(proc_id))
#
# except Exception as e2:
# dbg.print_exception(e2)
return None
instance = None
if len(instances) > 0:
instance = instances[random.randint(0, len(instances)-1)] if len(instances) > 0 else None
if instance:
self.lock_leave()
return instance.get_instance_for_http()
else:
cnt = self.check_all_proxies(platform)
if cnt <= 0:
proxies = proxy_crawler.crawl_proxies()
self.insert_all(proxies)
self.lock_leave()
return self.get(platform, proc_id)
except Exception as e:
dbg.print_exception(e)
self.lock_leave()
return proxy
def insert(self, ip, port):
instance = self.get_instance(ip, port)
if not instance:
proxy = Proxy2Model(ip, port)
self.lock_enter()
self.session.add(proxy)
self.lock_leave()
self.commit()
def insert_all(self, proxies):
print('{} proxy insert start'.format(len(proxies)))
# INSERT INTO proxy2(ip, PORT)
# SELECT <ip>, <port> FROM DUAL
# WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip=<ip> AND PORT=<port>)
self.lock.acquire()
for proxy in proxies:
query = r"INSERT INTO proxy2(ip, PORT) " \
r"SELECT '{}', {} FROM DUAL " \
r"WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip='{}' AND PORT={})"\
.format(proxy['ip'], proxy['port'], proxy['ip'], proxy['port'])
# 안됨 - 중복으로 들어감, 쓰레드 종료됨
self.engine.execute(query)
self.lock.release()
# self.query(Proxy2Model).insert()
#
# self.query(Proxy2Model).filter(Proxy2Model.ip == proxy['ip']).filter(Proxy2Model.port == proxy['port']).\
# filter(
# ~sqlalchemy.exists().where(
# sqlalchemy.and_(
# Proxy2Model.kw_id == Proxy2Model.kw_id,
# Proxy2Model.checkpoint_id == Proxy2Model.id
# )
# )
# )
#
# if self.session.query(Proxy2Model).filter_by(ip=proxy['ip']).filter_by(port=proxy['port']).count() == 0:
# self.session.add(Proxy2Model(proxy['ip'], proxy['port']))
print('{} proxy insert end'.format(len(proxies)))
def set_proxy_blocked(self, ip, port, platform):
try:

View File

@@ -86,13 +86,14 @@ def check_proxy(qu, proxy, url):
def crawl_proxies(check_url=None):
# print('proxy crawling start')
proxies = get_proxies_free_proxy()
print('proxy crawling start')
proxies = []
proxies += get_proxies_free_proxy()
proxies += get_proxies_proxy_searcher()
# proxies += get_proxies_nntime()
# proxies = list(set(proxies))
# print('proxy crawled {}'.format(len(proxies)))
proxies_alive = []
if check_url:
qu = queue.Queue()
threads = []
@@ -103,7 +104,6 @@ def crawl_proxies(check_url=None):
[th.start() for th in threads]
[th.join() for th in threads]
proxies_alive = []
while not qu.empty():
proxy = qu.get()
proxies_alive.append(proxy)
@@ -111,21 +111,9 @@ def crawl_proxies(check_url=None):
else:
proxies_alive = proxies
# print('proxy crawling end')
print('proxy crawled {}'.format(len(proxies_alive)))
return proxies_alive
# proxies_alive.sort()
# print('proxy crawler got {} proxies'.format(len(proxies_alive)))
#
# with open('proxy.txt', 'w') as f:
# print('proxy crawler dump start')
# for proxy in proxies_alive:
# # print(proxy)
# f.write(proxy + '\n')
# print('proxy crawler dump end')
#
# print('proxy crawling end')
if __name__ == '__main__':

View File

@@ -236,7 +236,7 @@ def make_list_instance(url, proxies=None):
return None
# @instance_wrapper
@instance_wrapper
def make_content_instance(url, proxies=None):
try:
content = InstaContent(url, {}, url, proxies)
@@ -265,7 +265,7 @@ def ajax_wrapper(func):
return retry_ajax_load
# @ajax_wrapper
@ajax_wrapper
def load_ajax_list(ins):
try:
insta_list = ins.load_more()
@@ -280,7 +280,7 @@ def load_ajax_list(ins):
return None
# @ajax_wrapper
@ajax_wrapper
def load_ajax_reply(ins):
try:
replies = ins.load_reply_more()
@@ -978,8 +978,9 @@ class InstaAlgorithmMulti(InstaAlgorithm):
self.total_num += 1
if self.is_until_page():
return False
# if self.list_crawl:
# printl("Number of Lists = {0}".format(len(self.list_crawl)))
# printl("Number of Lists = {0}".format(self.list_crawl.qsize()))
return True
def crawl(self):

View File

@@ -227,7 +227,7 @@ class TwitterCrawler:
for container_tags in reply_container_tags:
tweet_tags = container_tags.select('div.tweet')
if len(tweet_tags) > 0:
tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw)
tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, parent_tw, top_tw)
# print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link))
print('[{}] reply {} [{}]'.format(proc_id, tweet.top_link, 'ok'))
self.insert_content_pool(proc_id, content_queue, tweet, top_tw)

View File

@@ -5,10 +5,11 @@ import bs4
import datetime
import pytz
class TweetParser:
@staticmethod
def parse(tag, keyword_id, depth=0, top_tw: Tweet=None):
def parse(tag, keyword_id, depth=0, parent_tw: Tweet=None, top_tw: Tweet=None):
tweet = Tweet()
tweet.tweet_id = int(tag.attrs['data-tweet-id'])
@@ -62,7 +63,7 @@ class TweetParser:
tweet.platform_form = 'post'
tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id
tweet.article_form = 'body' if tweet.depth is 0 else 'reply'
# tweet.article_parent = None
tweet.article_parent = parent_tw.user_name if parent_tw else None
tweet.article_id = tweet.user_id
tweet.article_nickname = tweet.user_name
# tweet.article_title = None

View File

@@ -86,6 +86,8 @@ if __name__ == '__main__':
sys.argv[5] until_page
"""
print("arguments: {}".format(' '.join(sys.argv)))
if len(sys.argv) == 6:
print_and_flush("Python Crawling Executed")
else: