# Files
# clients/WebBasedCrawler/base/proxy2.py
# 2017-08-11 12:25:53 +09:00
#
# 225 lines
# 7.0 KiB
# Python

import base.proxy_crawler as proxy_crawler
import base.logger as logger
import sqlalchemy
import sqlalchemy.ext
import sqlalchemy.ext.declarative
import sqlalchemy.orm
import enum
import datetime
import threading
import random
import requests
import base.debug as dbg
# Declarative base class that the ORM model(s) in this module inherit from.
Base = sqlalchemy.ext.declarative.declarative_base()
@enum.unique
class Platform(enum.Enum):
    """Crawl targets for which a proxy's block state is tracked.

    Each member's value is the lowercase platform name; it is also the prefix
    of the matching ``<value>_block_at`` column on the proxy table model.
    ``@enum.unique`` guards against two members accidentally sharing a value.
    """

    NAVER = 'naver'
    DAUM = 'daum'
    FACEBOOK = 'facebook'
    KAKAO = 'kakao'
    INSTA = 'insta'
    TWITTER = 'twitter'
    YOUTUBE = 'youtube'
class Proxy2Model(Base):
    """ORM mapping of one row of the ``proxy2`` table.

    A row describes a single proxy endpoint (``ip``/``port``), one
    ``<platform>_block_at`` timestamp per :class:`Platform`, and a ``dead``
    flag.
    """

    __tablename__ = 'proxy2'

    # id, ip and port are all flagged primary_key, so the mapper treats
    # (id, ip, port) as a composite primary key.
    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True, nullable=False, autoincrement=True)
    ip = sqlalchemy.Column(sqlalchemy.String(15), primary_key=True)
    # NOTE(review): SmallInteger maps to a signed 16-bit SMALLINT on MySQL
    # (max 32767), so higher port numbers may not fit - confirm against the
    # real table definition before changing.
    port = sqlalchemy.Column(sqlalchemy.SmallInteger, primary_key=True)
    create_at = sqlalchemy.Column(sqlalchemy.DateTime, default=datetime.datetime.now)

    # One nullable timestamp per platform.
    naver_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    daum_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    facebook_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    kakao_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    insta_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    twitter_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    youtube_block_at = sqlalchemy.Column(sqlalchemy.DateTime)

    dead = sqlalchemy.Column(sqlalchemy.Boolean, default=False)

    # Platform -> name of the attribute holding that platform's timestamp.
    _BLOCK_ATTRS = {
        Platform.NAVER: 'naver_block_at',
        Platform.DAUM: 'daum_block_at',
        Platform.FACEBOOK: 'facebook_block_at',
        Platform.KAKAO: 'kakao_block_at',
        Platform.INSTA: 'insta_block_at',
        Platform.TWITTER: 'twitter_block_at',
        Platform.YOUTUBE: 'youtube_block_at',
    }

    def __init__(self, ip, port):
        """Create a not-yet-persisted proxy row for ``ip``:``port``."""
        self.ip = ip
        self.port = port

    @property
    def block_map(self):
        """Return ``{Platform: block timestamp or None}`` from current values.

        This is computed on access instead of being captured once in
        ``__init__``: a snapshot taken at construction time never reflects
        later updates, and it does not exist at all on instances that the ORM
        loads from the database, because loading bypasses ``__init__``.
        """
        return {
            platform: getattr(self, attr_name)
            for platform, attr_name in self._BLOCK_ATTRS.items()
        }

    def __repr__(self):
        return '{}:{} (twitter:{})'.format(self.ip, self.port, self.twitter_block_at)

    def get_instance_for_http(self):
        """Return a dict usable as the ``proxies`` argument of ``requests``.

        Besides the ``http``/``https`` entries (both ``"ip:port"``), the dict
        carries the raw ``ip`` and ``port`` under keys of the same name.
        """
        address = '{}:{}'.format(self.ip, self.port)
        return {
            'http': address,
            'https': address,
            'ip': self.ip,
            'port': self.port,
        }

    def set_block_at(self, platform, value):
        """Set the block timestamp of ``platform`` to ``value``.

        ``value`` may be ``None`` to clear the timestamp. A ``platform`` that
        is not a known :class:`Platform` member is ignored.
        """
        attr_name = self._BLOCK_ATTRS.get(platform)
        if attr_name is not None:
            setattr(self, attr_name, value)
class Proxy2Handler:
    """Hands out proxies from the ``proxy2`` table and tracks their health.

    The handler owns a SQLAlchemy engine plus a scoped session created in
    autocommit mode, and a ``threading.Lock`` that serialises :meth:`get`.
    """

    # Platform -> mapped column holding that platform's block timestamp.
    block_field_map = {
        Platform.NAVER: Proxy2Model.naver_block_at,
        Platform.DAUM: Proxy2Model.daum_block_at,
        Platform.FACEBOOK: Proxy2Model.facebook_block_at,
        Platform.KAKAO: Proxy2Model.kakao_block_at,
        Platform.INSTA: Proxy2Model.insta_block_at,
        Platform.TWITTER: Proxy2Model.twitter_block_at,
        Platform.YOUTUBE: Proxy2Model.youtube_block_at,
    }

    # Platform -> URL fetched through a proxy to test whether it still works.
    check_url_map = {
        Platform.NAVER: 'https://www.naver.com',
        Platform.DAUM: 'https://www.daum.net',
        Platform.FACEBOOK: 'https://www.facebook.com',
        Platform.KAKAO: 'https://story.kakao.com',
        Platform.INSTA: 'https://www.instagram.com',
        Platform.TWITTER: 'https://twitter.com',
        Platform.YOUTUBE: 'https://www.youtube.com',
    }

    # NOTE(security): this URL embeds database credentials in source control.
    # It is kept as the default so existing callers keep working; prefer
    # passing ``db_url`` explicitly (e.g. read from configuration).
    DEFAULT_DB_URL = 'mysql+pymysql://admin:admin123@bigbird.iptime.org/concepters?charset=utf8'

    def __init__(self, db_url=None):
        """Create the lock, the engine and the scoped session.

        :param db_url: SQLAlchemy database URL; when omitted or empty,
            ``DEFAULT_DB_URL`` is used.
        """
        self.lock = threading.Lock()
        self.engine = sqlalchemy.create_engine(db_url or self.DEFAULT_DB_URL)
        SessionFactory = sqlalchemy.orm.sessionmaker(bind=self.engine, autocommit=True, autoflush=True)
        self.session = sqlalchemy.orm.scoped_session(SessionFactory)

    def lock_enter(self):
        """Acquire the handler lock (blocks until available)."""
        self.lock.acquire()

    def lock_leave(self):
        """Release the handler lock."""
        self.lock.release()

    def commit(self):
        """Write pending ORM changes to the database.

        The session runs in autocommit mode, where ``Session.commit()`` has no
        open transaction to commit; ``flush()`` is used instead, which begins
        and commits its own transaction. Without it, added or modified objects
        stay pending until some later query happens to autoflush them.
        """
        self.session.flush()

    def get_oldest(self, platform):
        """Return the first row ordered by ``platform``'s block column, descending.

        NOTE(review): descending order yields the most recent block timestamp,
        which does not match the method name - confirm the intended direction
        (and NULL ordering on the target database) before relying on it.
        """
        return (
            self.session.query(Proxy2Model)
            .order_by(self.block_field_map[platform].desc())
            .first()
        )

    def get_query(self, ip, port):
        """Return a query selecting the row(s) with the given ``ip`` and ``port``."""
        return self.session.query(Proxy2Model).filter_by(ip=ip).filter_by(port=port)

    def get_instance(self, ip, port):
        """Return the first row matching ``ip``/``port``, or ``None``."""
        return self.get_query(ip, port).first()

    def check_all_proxies(self, platform):
        """Re-test a small batch of blocked proxies against ``platform``.

        Up to 8 rows that have a block timestamp for ``platform`` and
        ``dead == False`` are taken in ascending timestamp order and probed
        with a 1-second request through the proxy:

        * ``ProxyError``            -> the row is flagged ``dead``;
        * any other exception       -> block timestamp refreshed to now;
        * response with ``resp.ok`` -> block timestamp cleared;
        * any other response        -> block timestamp refreshed to now.

        :return: number of proxies whose block timestamp was cleared.
        """
        block_column = self.block_field_map[platform]
        url = self.check_url_map[platform]
        candidates = (
            self.session.query(Proxy2Model)
            .filter(block_column.isnot(None))
            .filter_by(dead=False)
            .order_by(block_column)
            .limit(8)
            .all()
        )

        alive_cnt = 0
        for instance in candidates:
            try:
                resp = requests.get(url, proxies=instance.get_instance_for_http(), timeout=1)
            except requests.exceptions.ProxyError:
                # Must come before the generic handler below.
                instance.dead = True
            except Exception:
                # Timeouts, SSL/connection errors and anything unexpected all
                # count as "still blocked".
                instance.set_block_at(platform, datetime.datetime.now())
            else:
                if resp.ok:
                    instance.set_block_at(platform, None)
                    alive_cnt += 1
                else:
                    instance.set_block_at(platform, datetime.datetime.now())

        # Persist the probe results before reporting them.
        self.commit()
        return alive_cnt

    def get(self, platform, proc_id=-1):
        """Return a random usable proxy for ``platform``, or ``None``.

        A proxy is usable when it has no block timestamp for ``platform`` and
        is not flagged dead (rows whose ``dead`` is NULL count as usable).
        The choice is made among at most 32 such rows. When there is none,
        blocked proxies are re-tested and, if none recovered, new proxies are
        crawled and inserted; ``None`` is returned in that case so the caller
        can call again.

        The whole operation runs under the handler lock, which is released
        even when a database or network error propagates.

        :param proc_id: accepted for interface compatibility; not used here.
        """
        self.lock_enter()
        try:
            block_column = self.block_field_map[platform]
            not_dead = sqlalchemy.or_(
                Proxy2Model.dead.is_(None),
                Proxy2Model.dead == False,  # noqa: E712 - SQL expression, not a Python comparison
            )
            candidates = (
                self.session.query(Proxy2Model)
                .filter(block_column.is_(None))
                .filter(not_dead)
                .limit(32)
                .all()
            )
            if candidates:
                return random.choice(candidates).get_instance_for_http()

            if self.check_all_proxies(platform) <= 0:
                self.insert_all(proxy_crawler.crawl_proxies())
            return None
        finally:
            self.lock_leave()

    def insert(self, ip, port):
        """Add a row for ``ip``:``port`` unless one already exists."""
        if self.get_instance(ip, port) is None:
            self.session.add(Proxy2Model(ip, port))
            self.commit()

    def insert_all(self, proxies):
        """Insert every proxy in ``proxies`` that is not yet in the table.

        :param proxies: sized iterable of mappings with ``'ip'`` and
            ``'port'`` keys.

        Values are sent as bound parameters rather than formatted into the
        SQL text, because they originate from crawled data.
        """
        print('{} proxy insert start'.format(len(proxies)))
        statement = sqlalchemy.text(
            "INSERT INTO proxy2(ip, PORT) "
            "SELECT :ip, :port FROM DUAL "
            "WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip=:ip AND PORT=:port)"
        )
        for proxy in proxies:
            self.engine.execute(statement, ip=proxy['ip'], port=proxy['port'])
        print('{} proxy insert end'.format(len(proxies)))

    def set_proxy_blocked(self, ip, port, platform):
        """Stamp ``platform``'s block column of ``ip``:``port`` with the current time.

        Any exception is reported through ``dbg.print_exception`` and not
        re-raised.
        """
        try:
            block_column = self.block_field_map[platform]
            self.get_query(ip, port).update({block_column: datetime.datetime.now()})
            self.commit()
        except Exception as e:
            dbg.print_exception(e)
if __name__ == '__main__':
    # Manual smoke test: build a handler and request one Twitter proxy.
    proxy_handler = Proxy2Handler()

    # Other calls that can be enabled by hand while experimenting:
    #   proxy_handler.insert('127.0.0.5', 80)
    #   proxy_handler.commit()
    #   proxy_handler.set_proxy_blocked('127.0.0.3', 80, Platform.TWITTER)
    #   instance = proxy_handler.get(Platform.TWITTER)
    #   instance = proxy_handler.get_oldest(Platform.TWITTER)
    #   print(instance)

    proxy = proxy_handler.get(Platform.TWITTER)