Compare commits

6 Commits

Author SHA1 Message Date
mjjo
24587435b6 네이버 블로그, 카페 크롤러 문제 수정 2017-08-17 16:51:04 +09:00
mjjo
8854af26d6 gitignore 파일 수정 2017-08-14 19:00:24 +09:00
mjjo
ed7a6ddad9 인스타 content crawling 실패시 재시도 2017-08-14 18:56:43 +09:00
mjjo
87968097a9 args print 2017-08-11 12:27:16 +09:00
mjjo
3142782428 프록시에 lock 적용, exception 해결 2017-08-11 12:25:53 +09:00
mjjo
aa2f5b9f71 트위터 크롤러 상위 작성자 표시 2017-08-10 17:00:12 +09:00
13 changed files with 202 additions and 180 deletions

12
.gitignore vendored
View File

@@ -1,8 +1,12 @@
**/.idea/
**/__pycache__/ **/__pycache__/
*.user **/.idea/
**/build-*/ **/build-*/
WebBasedCrawler/proxy.txt
clients-win/ clients-win/
clients-linux/ clients-linux/
**/*.log
*.user
*.csv
*.log
*.bak
WebBasedCrawler/proxy.txt

View File

@@ -5,7 +5,7 @@
SNaverNewsManage::SNaverNewsManage(QObject *pObject) : SManage(pObject) SNaverNewsManage::SNaverNewsManage(QObject *pObject) : SManage(pObject)
{ {
m_nID = 0; m_nID = 0;
connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT()); // connect(&m_pro[0], SIGNAL(readyReadStandardOutput()), this, SLOT());
} }
QString SNaverNewsManage::makeGetListQuery(QString _str,QDate _date,int _nPage) QString SNaverNewsManage::makeGetListQuery(QString _str,QDate _date,int _nPage)

View File

@@ -10,6 +10,11 @@ using namespace std;
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
// cout << "arguments: ";
// for(int i=0; i<argc; i++)
// cout << " " << argv[i];
// cout << endl;
srand(time(0)); srand(time(0));
QApplication a(argc, argv); QApplication a(argc, argv);
a.setApplicationName(QString("Chrome")); a.setApplicationName(QString("Chrome"));
@@ -39,5 +44,6 @@ int main(int argc, char *argv[])
process->load(strArgv); process->load(strArgv);
a.exec(); a.exec();
delete process;
return 0; return 0;
} }

View File

@@ -39,10 +39,17 @@ SCrawler::SCrawler():QObject()
m_bProcessed = false; m_bProcessed = false;
connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool))); connect(m_page, SIGNAL(loadFinished(bool)), this, SLOT(saveResult(bool)));
srand(time(NULL)); srand(time(NULL));
m_pNAM = new QNetworkAccessManager(this);
m_page->setNetworkAccessManager(m_pNAM);
} }
SCrawler::~SCrawler() SCrawler::~SCrawler()
{ {
m_page->setNetworkAccessManager(nullptr);
delete m_pNAM;
delete m_page;
} }
void SCrawler::load(QStringList _strlistArgv) void SCrawler::load(QStringList _strlistArgv)
@@ -164,17 +171,18 @@ void SCrawler::load(QStringList _strlistArgv)
url.setScheme("http"); url.setScheme("http");
m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false); m_page->settings()->setAttribute(QWebSettings::AutoLoadImages,false);
QNetworkRequest *request = new QNetworkRequest; QNetworkRequest request;
request->setUrl(url); request.setUrl(url);
/* /*
request->setRawHeader("Cache-Control","max-age=0, no-cache"); request.setRawHeader("Cache-Control","max-age=0, no-cache");
request->setRawHeader("Pragma","no-cache"); request.setRawHeader("Pragma","no-cache");
request->setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT"); request.setRawHeader("Expires","Thu, 01 Jan 1970 16:00:00 GMT");
*/ */
if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA) if (m_strReper.isEmpty() == false && m_nSelect == E_NAVER_CAFE_DATA)
request->setRawHeader("Referer",m_strReper.toLocal8Bit()); request.setRawHeader("Referer",m_strReper.toLocal8Bit());
request->setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2"); request.setRawHeader("Accept-Language","ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4,zh-CN;q=0.2,zh;q=0.2");
m_page->mainFrame()->load(*request); m_page->mainFrame()->load(request);
m_bLast = false; m_bLast = false;
m_bError = false; m_bError = false;
} }
@@ -198,6 +206,8 @@ void SCrawler::saveResult(bool ok)
{ {
// qDebug() << "saveResult"; // qDebug() << "saveResult";
// cout << "page data: "<< m_page->bytesReceived() << endl;
if (!ok) if (!ok)
{ {
cout << "Failed loading"; cout << "Failed loading";
@@ -209,14 +219,17 @@ void SCrawler::saveResult(bool ok)
//qDebug() << "load complete"; //qDebug() << "load complete";
switch(m_nSelect) switch(m_nSelect)
{ {
case E_NAVER_NEWS_LIST:saveFrameNewsList(m_page->mainFrame());break; case E_NAVER_NEWS_LIST:
saveFrameNewsList(m_page->mainFrame());
break;
case E_NAVER_NEWS_DATA: case E_NAVER_NEWS_DATA:
{ {
static bool loaded = false; static bool loaded = false;
if(!loaded) if(!loaded)
{ {
loaded = true; loaded = true;
if(!saveFrameNewsUrl(m_page->mainFrame())) if(saveFrameNewsUrl(m_page->mainFrame()) == false)
{ {
loaded = false; loaded = false;
return; return;
@@ -229,48 +242,57 @@ void SCrawler::saveResult(bool ok)
break; break;
} }
case E_NAVER_NEWS_REPLY: case E_NAVER_NEWS_REPLY:
{
if(!saveFrameNewsComment(m_page->mainFrame())) if(!saveFrameNewsComment(m_page->mainFrame()))
return; return;
break; break;
}
case E_NAVER_CAFE_LIST:saveFrameCafeList(m_page->mainFrame());break; case E_NAVER_CAFE_LIST:
saveFrameCafeList(m_page->mainFrame());
break;
case E_NAVER_CAFE_DATA: case E_NAVER_CAFE_DATA:
{
saveFrameCafeUrl(m_page->mainFrame()); saveFrameCafeUrl(m_page->mainFrame());
bodydata.sendDB(); bodydata.sendDB();
break; break;
}
case E_NAVER_BLOG_LIST: case E_NAVER_BLOG_LIST:
{ if(saveFrameList(m_page->mainFrame()) == false)
if(saveFrameList(m_page->mainFrame())) return;
break; break;
else
return;
}
case E_NAVER_BLOG_BODY: case E_NAVER_BLOG_BODY:
{ if(saveFrameUrl(m_page->mainFrame()) == false)
if(!saveFrameUrl(m_page->mainFrame()))
return; return;
bodydata.sendDB(); bodydata.sendDB();
break; break;
}
case E_NAVER_BLOG_REPLY:saveFrameComment(m_page->mainFrame());break; case E_NAVER_BLOG_REPLY:
case E_DAUM_CAFE_LIST:saveFrameDaumCafeList(m_page->mainFrame());break; saveFrameComment(m_page->mainFrame());
break;
case E_DAUM_CAFE_LIST:
saveFrameDaumCafeList(m_page->mainFrame());
break;
case E_DAUM_CAFE_DATA: case E_DAUM_CAFE_DATA:
{
saveFrameDaumCafeUrl(m_page->mainFrame()); saveFrameDaumCafeUrl(m_page->mainFrame());
bodydata.sendDB(); bodydata.sendDB();
break; break;
}
case E_DAUM_BLOG_LIST:saveFrameDaumBlogList(m_page->mainFrame());break; case E_DAUM_BLOG_LIST:
saveFrameDaumBlogList(m_page->mainFrame());
break;
case E_DAUM_BLOG_BODY: case E_DAUM_BLOG_BODY:
{
saveFrameDaumBlogUrl(m_page->mainFrame()); saveFrameDaumBlogUrl(m_page->mainFrame());
bodydata.sendDB(); bodydata.sendDB();
break; break;
}
case E_DAUM_BLOG_REPLY:saveFrameDaumBlogComment(m_page->mainFrame());break; case E_DAUM_BLOG_REPLY:
saveFrameDaumBlogComment(m_page->mainFrame());
break;
} }
switch(m_nSelect) switch(m_nSelect)
@@ -296,8 +318,8 @@ void SCrawler::saveResult(bool ok)
cout << "last"; cout << "last";
m_bLast = false; m_bLast = false;
} }
break; break;
case E_NAVER_BLOG_REPLY: case E_NAVER_BLOG_REPLY:
case E_NAVER_NEWS_REPLY: case E_NAVER_NEWS_REPLY:
case E_DAUM_BLOG_REPLY: case E_DAUM_BLOG_REPLY:
@@ -327,9 +349,9 @@ void SCrawler::saveResult(bool ok)
} }
break; break;
} }
qDebug() << "finish";
emit finished();
qDebug() << " finish";
emit finished();
} }
int SCrawler::GetNumber(QString _str) int SCrawler::GetNumber(QString _str)
@@ -407,15 +429,13 @@ void SCrawler::reloadListPage()
bool SCrawler::saveFrameList(QWebFrame *frame) bool SCrawler::saveFrameList(QWebFrame *frame)
{ {
if (m_bProcessed == false) if (m_bProcessed == false)
m_bProcessed = true; m_bProcessed = true;
else else
return false; return false;
//qDebug() << frame->documentElement().toPlainText(); if (m_bUse == true)
return true;
if (m_bUse == true) return true;
QWebElement notFound = Find(frame->documentElement(),"div","id","notfound"); QWebElement notFound = Find(frame->documentElement(),"div","id","notfound");
if(notFound.isNull() == false) if(notFound.isNull() == false)
{ {
@@ -577,7 +597,11 @@ bool SCrawler::saveFrameList(QWebFrame *frame)
{ {
QWebElement total = Find(eleMain,"span","class","title_num"); QWebElement total = Find(eleMain,"span","class","title_num");
if (total.toPlainText().isEmpty()) {m_bError = true; return true;} if (total.toPlainText().isEmpty())
{
m_bError = true;
return true;
}
int nTotal = GetNumber(total.toPlainText().split("/").at(1)); int nTotal = GetNumber(total.toPlainText().split("/").at(1));
QStringList strList = m_strUrl.split("&"); QStringList strList = m_strUrl.split("&");
bool ok = false; bool ok = false;
@@ -817,7 +841,7 @@ bool SCrawler::saveFrameUrl(QWebFrame *frame)
void SCrawler::reloadPage() void SCrawler::reloadPage()
{ {
//qDebug() << "reloadPage called"; // qDebug() << "reloadPage called";
saveResult(true); saveResult(true);
} }
@@ -1059,7 +1083,11 @@ void SCrawler::saveFrameCafeList(QWebFrame *frame)
{ {
QWebElement total = Find(eleMain,"span","class","title_num"); QWebElement total = Find(eleMain,"span","class","title_num");
if (total.toPlainText().isEmpty()) {m_bError = true; return;} if (total.toPlainText().isEmpty())
{
m_bError = true;
return;
}
total.toPlainText().split("/").size(); total.toPlainText().split("/").size();
int nTotal = GetNumber(total.toPlainText().split("/").at(1)); int nTotal = GetNumber(total.toPlainText().split("/").at(1));
QStringList strList = m_strUrl.split("&"); QStringList strList = m_strUrl.split("&");
@@ -1377,7 +1405,11 @@ void SCrawler::saveFrameDaumCafeList(QWebFrame *frame)
b_last = b_last | !(Find(frame->documentElement(), "div", "class", "result_message mg_cont").isNull()); b_last = b_last | !(Find(frame->documentElement(), "div", "class", "result_message mg_cont").isNull());
QWebElement total = Find(eleMain,"span","class","f_nb f_l"); QWebElement total = Find(eleMain,"span","class","f_nb f_l");
if (total.toPlainText().isEmpty()) {m_bError = true; return;} if (total.toPlainText().isEmpty())
{
m_bError = true;
return;
}
QString strTotal = total.toPlainText().split("/").at(1); QString strTotal = total.toPlainText().split("/").at(1);
strTotal = strTotal.replace(",",""); strTotal = strTotal.replace(",","");
@@ -1635,47 +1667,66 @@ void SCrawler::saveFrameDaumBlogList(QWebFrame *frame){}
void SCrawler::saveFrameNewsList(QWebFrame *frame) void SCrawler::saveFrameNewsList(QWebFrame *frame)
{ {
if (m_bUse == true) return; if (m_bUse == true)
return;
// QFile file("pagedata.txt");
// if ( file.open(QIODevice::ReadWrite) )
// {
// QTextStream stream( &file );
// stream << frame->documentElement().toOuterXml() << endl;
// file.close();
// }
QWebElement notFound = Find(frame->documentElement(),"div","class","no_content"); QWebElement notFound = Find(frame->documentElement(),"div","class","no_content");
if(notFound.isNull() == false) if(notFound.isNull() == false)
{ {
m_bLast = true; m_bLast = true;
return; return;
} }
QWebElement eleMain = Find(frame->documentElement(),"div","class","srch_result_area headline"); QWebElement eleMain = Find(frame->documentElement(),"div","class","srch_result_area headline");
foreach(QWebElement eleSub,eleMain.findAll("div")) foreach(QWebElement eleSub,eleMain.findAll("div"))
{ {
if (eleSub.attribute("class") == QString("info")) if (eleSub.attribute("class") == QString("info"))
{ {
QString str = Find(eleSub,"a","class","go_naver").attribute("href"); QString str = Find(eleSub,"a","class","go_naver").attribute("href");
if (str.trimmed().isEmpty()) continue; if (str.trimmed().isEmpty())
if (str.contains("http://sports")) continue; continue;
if (str.contains("http://sports"))
continue;
m_bNothing = true; m_bNothing = true;
cout << "o " << str.toStdString() << endl; cout << "o " << str.toStdString() << endl;
} }
} }
QStringList strTotal = bodydata.GetNumber(Find(frame->documentElement(),"span","class","result_num").toPlainText().trimmed()); // QStringList strTotal = bodydata.GetNumber(Find(frame->documentElement(),"span","class","result_num").toPlainText().trimmed());
QVector <int> vecTotal; QWebElement Total = Find(frame->documentElement(), "div", "class", "title_desc");
foreach(QString str,strTotal) QStringList nums = bodydata.GetNumber(Total.toPlainText());
if(nums.count() < 3)
{ {
if (str.trimmed().isEmpty() == false) m_bError = true;
vecTotal.push_back(str.toInt()); m_bUse = true;
return;
} }
if (vecTotal.size() == 3) QVector <int> vecTotal;
{ vecTotal.push_back(nums[0].toInt());
if (vecTotal[0] >= vecTotal[1]) m_bLast = true; vecTotal.push_back(nums[1].toInt());
if (vecTotal[1] == vecTotal[2]) m_bLast = true; vecTotal.push_back(nums[2].toInt());
}
else if (vecTotal[0] >= vecTotal[1] || vecTotal[1] == vecTotal[2])
m_bError = true; m_bLast = true;
m_bUse = true; m_bUse = true;
} }
bool SCrawler::saveFrameNewsUrl(QWebFrame *frame) bool SCrawler::saveFrameNewsUrl(QWebFrame *frame)
{ {
if (m_bUse) return true; if (m_bUse)
return true;
{ {
QString strQuery = "delete from "; QString strQuery = "delete from ";
@@ -2388,19 +2439,25 @@ bool SCrawler::setProxyFromFile()
switch(strList.size()) switch(strList.size())
{ {
case 1: case 1:
{
cout << "p : " << strList.at(0).toStdString() << " from File" << endl; cout << "p : " << strList.at(0).toStdString() << " from File" << endl;
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); //manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
//m_page->setNetworkAccessManager(manager); //m_page->setNetworkAccessManager(manager);
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); QNetworkProxy proxy(QNetworkProxy::HttpProxy, strList.at(0));
QNetworkProxy::setApplicationProxy(proxy);
}
break; break;
case 2: case 2:
{
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl; cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from File" << endl;
//manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); //manager->setProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt())));
//m_page->setNetworkAccessManager(manager); //m_page->setNetworkAccessManager(manager);
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); QNetworkProxy proxy(QNetworkProxy::HttpProxy, strList.at(0), strList.at(1).toInt());
QNetworkProxy::setApplicationProxy(proxy);
}
break; break;
} }
} }
@@ -2439,11 +2496,12 @@ bool SCrawler::setProxyFromDb()
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0)))); QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0))));
break; break;
case 2: case 2:
cout << "p : " << strList.at(0).toStdString() << ":" << strList.at(1).toStdString() << " from DB" << endl;
m_strProxyIP = strList.at(0); m_strProxyIP = strList.at(0);
m_nProxyPort = strList.at(1).toInt(); m_nProxyPort = strList.at(1).toInt();
QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy,strList.at(0),strList.at(1).toInt()))); cout << "p : " << m_strProxyIP.toStdString() << ":" << m_nProxyPort << " from DB" << endl;
//QNetworkProxy::setApplicationProxy(*(new QNetworkProxy(QNetworkProxy::HttpProxy, m_strProxyIP, m_nProxyPort)));
/* /*
QString strProxyHost = "61.103.7.74"; QString strProxyHost = "61.103.7.74";
int nPort = 2074; int nPort = 2074;
@@ -2468,7 +2526,6 @@ bool SCrawler::setProxyFromDb()
void SCrawler::setProxy() void SCrawler::setProxy()
{ {
bool ok = setProxyFromFile() || setProxyFromDb(); bool ok = setProxyFromFile() || setProxyFromDb();
//bool ok = false;
if (!ok) if (!ok)
cout << "No Proxy" << endl; cout << "No Proxy" << endl;
} }

View File

@@ -6,6 +6,7 @@
class SCrawler : public QObject class SCrawler : public QObject
{ {
Q_OBJECT Q_OBJECT
public: public:
enum E_SELECT enum E_SELECT
{ {
@@ -25,7 +26,7 @@ public:
}; };
public: public:
SCrawler(); SCrawler();
~SCrawler(); virtual ~SCrawler();
void load(QStringList _strlistArgv); void load(QStringList _strlistArgv);
void saveFile(); void saveFile();
static void Debug(QString _strFilename,QString _strData); static void Debug(QString _strFilename,QString _strData);
@@ -35,6 +36,7 @@ private slots:
void saveResult(bool ok); void saveResult(bool ok);
void reloadPage(); void reloadPage();
void reloadListPage(); void reloadListPage();
private: private:
int m_nSelect; int m_nSelect;
QString m_strReper; QString m_strReper;
@@ -43,6 +45,7 @@ private:
SCrawlerData bodydata; SCrawlerData bodydata;
QWebPage *m_page; QWebPage *m_page;
QNetworkAccessManager* m_pNAM;
QString m_strFile; QString m_strFile;
QString m_strUrl; QString m_strUrl;
QString m_strTable; QString m_strTable;

View File

@@ -136,6 +136,9 @@ QStringList SCrawlerData::GetNumber(QString _str)
{ {
if (pch[i].isNumber() || pch[i].isSpace()) if (pch[i].isNumber() || pch[i].isSpace())
str += pch[i]; str += pch[i];
else if(pch[i] != ',' && pch[i] != '.')
str += ' ';
} }
return str.trimmed().split(" ");
return str.trimmed().split(" ", QString::SkipEmptyParts);
} }

View File

@@ -25,9 +25,13 @@ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
def is_debugger_attached(): def is_debugger_attached():
try:
for frame in inspect.stack(): for frame in inspect.stack():
if frame[1].endswith("pydevd.py"): if frame[1].endswith("pydevd.py"):
return True return True
except:
return False
return False return False
is_debug = is_debugger_attached() is_debug = is_debugger_attached()

View File

@@ -103,32 +103,27 @@ class Proxy2Handler:
def lock_enter(self): def lock_enter(self):
# logger.log('lock {}'.format(threading.current_thread().ident)) # logger.log('lock {}'.format(threading.current_thread().ident))
# self.lock.acquire() self.lock.acquire()
pass pass
def lock_leave(self): def lock_leave(self):
# self.lock.release() self.lock.release()
# logger.log('unlock {}'.format(threading.current_thread().ident)) # logger.log('unlock {}'.format(threading.current_thread().ident))
pass pass
def commit(self): def commit(self):
self.lock_enter()
# self.session.commit() # self.session.commit()
self.lock_leave() pass
def get_oldest(self, platform): def get_oldest(self, platform):
self.lock_enter()
instance = self.session.query(Proxy2Model).order_by(self.block_field_map[platform].desc()).first() instance = self.session.query(Proxy2Model).order_by(self.block_field_map[platform].desc()).first()
self.lock_leave()
return instance return instance
def get_query(self, ip, port): def get_query(self, ip, port):
return self.session.query(Proxy2Model).filter_by(ip=ip).filter_by(port=port) return self.session.query(Proxy2Model).filter_by(ip=ip).filter_by(port=port)
def get_instance(self, ip, port): def get_instance(self, ip, port):
self.lock_enter()
instance = self.get_query(ip, port).first() instance = self.get_query(ip, port).first()
self.lock_leave()
return instance return instance
def check_all_proxies(self, platform): def check_all_proxies(self, platform):
@@ -161,7 +156,7 @@ class Proxy2Handler:
if resp.ok: if resp.ok:
instance.set_block_at(platform, None) instance.set_block_at(platform, None)
alive_cnt += 1 alive_cnt += 1
print('proxy {}:{} alive'.format(instance.ip, instance.port)) # print('proxy {}:{} alive'.format(instance.ip, instance.port))
else: else:
instance.set_block_at(platform, datetime.datetime.now()) instance.set_block_at(platform, datetime.datetime.now())
@@ -171,34 +166,12 @@ class Proxy2Handler:
def get(self, platform, proc_id=-1): def get(self, platform, proc_id=-1):
self.lock_enter() self.lock_enter()
try:
block_column = self.block_field_map[platform] block_column = self.block_field_map[platform]
try:
instances = self.session.query(Proxy2Model).filter(block_column == None).limit(32).all() instances = self.session.query(Proxy2Model).filter(block_column == None).limit(32).all()
except Exception as e:
dbg.print_exception()
assert True
self.lock_leave()
# try:
# session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine)
# self.session = sqlalchemy.orm.scoped_session(session_factory)
# logger.log('{} session recreate'.format(proc_id))
#
# except Exception as e2:
# dbg.print_exception(e2)
return None
instance = None
if len(instances) > 0:
instance = instances[random.randint(0, len(instances)-1)] if len(instances) > 0 else None instance = instances[random.randint(0, len(instances)-1)] if len(instances) > 0 else None
proxy = None
if instance: if instance:
self.lock_leave() proxy = instance.get_instance_for_http()
return instance.get_instance_for_http()
else: else:
cnt = self.check_all_proxies(platform) cnt = self.check_all_proxies(platform)
if cnt <= 0: if cnt <= 0:
@@ -206,47 +179,27 @@ class Proxy2Handler:
self.insert_all(proxies) self.insert_all(proxies)
self.lock_leave() self.lock_leave()
return self.get(platform, proc_id) return proxy
except Exception as e:
dbg.print_exception(e)
def insert(self, ip, port): def insert(self, ip, port):
instance = self.get_instance(ip, port) instance = self.get_instance(ip, port)
if not instance: if not instance:
proxy = Proxy2Model(ip, port) proxy = Proxy2Model(ip, port)
self.lock_enter()
self.session.add(proxy) self.session.add(proxy)
self.lock_leave() self.commit()
def insert_all(self, proxies): def insert_all(self, proxies):
print('{} proxy insert start'.format(len(proxies)))
# INSERT INTO proxy2(ip, PORT) # INSERT INTO proxy2(ip, PORT)
# SELECT <ip>, <port> FROM DUAL # SELECT <ip>, <port> FROM DUAL
# WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip=<ip> AND PORT=<port>) # WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip=<ip> AND PORT=<port>)
self.lock.acquire()
for proxy in proxies: for proxy in proxies:
query = r"INSERT INTO proxy2(ip, PORT) " \ query = r"INSERT INTO proxy2(ip, PORT) " \
r"SELECT '{}', {} FROM DUAL " \ r"SELECT '{}', {} FROM DUAL " \
r"WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip='{}' AND PORT={})"\ r"WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip='{}' AND PORT={})"\
.format(proxy['ip'], proxy['port'], proxy['ip'], proxy['port']) .format(proxy['ip'], proxy['port'], proxy['ip'], proxy['port'])
# 안됨 - 중복으로 들어감, 쓰레드 종료됨
self.engine.execute(query) self.engine.execute(query)
self.lock.release() print('{} proxy insert end'.format(len(proxies)))
# self.query(Proxy2Model).insert()
#
# self.query(Proxy2Model).filter(Proxy2Model.ip == proxy['ip']).filter(Proxy2Model.port == proxy['port']).\
# filter(
# ~sqlalchemy.exists().where(
# sqlalchemy.and_(
# Proxy2Model.kw_id == Proxy2Model.kw_id,
# Proxy2Model.checkpoint_id == Proxy2Model.id
# )
# )
# )
#
# if self.session.query(Proxy2Model).filter_by(ip=proxy['ip']).filter_by(port=proxy['port']).count() == 0:
# self.session.add(Proxy2Model(proxy['ip'], proxy['port']))
def set_proxy_blocked(self, ip, port, platform): def set_proxy_blocked(self, ip, port, platform):
try: try:

View File

@@ -86,13 +86,14 @@ def check_proxy(qu, proxy, url):
def crawl_proxies(check_url=None): def crawl_proxies(check_url=None):
# print('proxy crawling start') print('proxy crawling start')
proxies = get_proxies_free_proxy() proxies = []
proxies += get_proxies_free_proxy()
proxies += get_proxies_proxy_searcher() proxies += get_proxies_proxy_searcher()
# proxies += get_proxies_nntime() # proxies += get_proxies_nntime()
# proxies = list(set(proxies)) # proxies = list(set(proxies))
# print('proxy crawled {}'.format(len(proxies)))
proxies_alive = []
if check_url: if check_url:
qu = queue.Queue() qu = queue.Queue()
threads = [] threads = []
@@ -103,7 +104,6 @@ def crawl_proxies(check_url=None):
[th.start() for th in threads] [th.start() for th in threads]
[th.join() for th in threads] [th.join() for th in threads]
proxies_alive = []
while not qu.empty(): while not qu.empty():
proxy = qu.get() proxy = qu.get()
proxies_alive.append(proxy) proxies_alive.append(proxy)
@@ -111,21 +111,9 @@ def crawl_proxies(check_url=None):
else: else:
proxies_alive = proxies proxies_alive = proxies
# print('proxy crawling end') print('proxy crawled {}'.format(len(proxies_alive)))
return proxies_alive return proxies_alive
# proxies_alive.sort()
# print('proxy crawler got {} proxies'.format(len(proxies_alive)))
#
# with open('proxy.txt', 'w') as f:
# print('proxy crawler dump start')
# for proxy in proxies_alive:
# # print(proxy)
# f.write(proxy + '\n')
# print('proxy crawler dump end')
#
# print('proxy crawling end')
if __name__ == '__main__': if __name__ == '__main__':

View File

@@ -236,7 +236,7 @@ def make_list_instance(url, proxies=None):
return None return None
# @instance_wrapper @instance_wrapper
def make_content_instance(url, proxies=None): def make_content_instance(url, proxies=None):
try: try:
content = InstaContent(url, {}, url, proxies) content = InstaContent(url, {}, url, proxies)
@@ -265,7 +265,7 @@ def ajax_wrapper(func):
return retry_ajax_load return retry_ajax_load
# @ajax_wrapper @ajax_wrapper
def load_ajax_list(ins): def load_ajax_list(ins):
try: try:
insta_list = ins.load_more() insta_list = ins.load_more()
@@ -280,7 +280,7 @@ def load_ajax_list(ins):
return None return None
# @ajax_wrapper @ajax_wrapper
def load_ajax_reply(ins): def load_ajax_reply(ins):
try: try:
replies = ins.load_reply_more() replies = ins.load_reply_more()
@@ -978,8 +978,9 @@ class InstaAlgorithmMulti(InstaAlgorithm):
self.total_num += 1 self.total_num += 1
if self.is_until_page(): if self.is_until_page():
return False return False
# if self.list_crawl: # if self.list_crawl:
# printl("Number of Lists = {0}".format(len(self.list_crawl))) # printl("Number of Lists = {0}".format(self.list_crawl.qsize()))
return True return True
def crawl(self): def crawl(self):

View File

@@ -227,7 +227,7 @@ class TwitterCrawler:
for container_tags in reply_container_tags: for container_tags in reply_container_tags:
tweet_tags = container_tags.select('div.tweet') tweet_tags = container_tags.select('div.tweet')
if len(tweet_tags) > 0: if len(tweet_tags) > 0:
tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, top_tw) tweet = TweetParser.parse(tweet_tags[0], self.default_config.keyword_id, parent_tw.depth+1, parent_tw, top_tw)
# print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link)) # print('[{}]>>> {} {}: {} ({}) ({})'.format(proc_id, tweet.created_at, tweet.user_name, tweet.text[:20], tweet.depth, tweet.tweet_link))
print('[{}] reply {} [{}]'.format(proc_id, tweet.top_link, 'ok')) print('[{}] reply {} [{}]'.format(proc_id, tweet.top_link, 'ok'))
self.insert_content_pool(proc_id, content_queue, tweet, top_tw) self.insert_content_pool(proc_id, content_queue, tweet, top_tw)

View File

@@ -5,10 +5,11 @@ import bs4
import datetime import datetime
import pytz import pytz
class TweetParser: class TweetParser:
@staticmethod @staticmethod
def parse(tag, keyword_id, depth=0, top_tw: Tweet=None): def parse(tag, keyword_id, depth=0, parent_tw: Tweet=None, top_tw: Tweet=None):
tweet = Tweet() tweet = Tweet()
tweet.tweet_id = int(tag.attrs['data-tweet-id']) tweet.tweet_id = int(tag.attrs['data-tweet-id'])
@@ -62,7 +63,7 @@ class TweetParser:
tweet.platform_form = 'post' tweet.platform_form = 'post'
tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id tweet.platform_title = top_tw.user_id if top_tw else tweet.user_id
tweet.article_form = 'body' if tweet.depth is 0 else 'reply' tweet.article_form = 'body' if tweet.depth is 0 else 'reply'
# tweet.article_parent = None tweet.article_parent = parent_tw.user_name if parent_tw else None
tweet.article_id = tweet.user_id tweet.article_id = tweet.user_id
tweet.article_nickname = tweet.user_name tweet.article_nickname = tweet.user_name
# tweet.article_title = None # tweet.article_title = None

View File

@@ -86,6 +86,8 @@ if __name__ == '__main__':
sys.argv[5] until_page sys.argv[5] until_page
""" """
print("arguments: {}".format(' '.join(sys.argv)))
if len(sys.argv) == 6: if len(sys.argv) == 6:
print_and_flush("Python Crawling Executed") print_and_flush("Python Crawling Executed")
else: else: