"""Proxy pool stored in the ``proxy2`` table, with per-platform block tracking."""
import base.proxy_crawler as proxy_crawler
import base.logger as logger
import sqlalchemy
import sqlalchemy.ext
import sqlalchemy.ext.declarative
import sqlalchemy.orm
import enum
import datetime
import threading
import random
import requests

Base = sqlalchemy.ext.declarative.declarative_base()

# NOTE(review): credentials are embedded in source; consider moving them to
# configuration. Kept as the default so existing callers keep working.
DEFAULT_DB_URL = 'mysql+pymysql://admin:admin123@bigbird.iptime.org/concepters?charset=utf8'


class Platform(enum.Enum):
    """Target platforms for which a proxy can be marked as blocked."""

    NAVER = 'naver'
    DAUM = 'daum'
    FACEBOOK = 'facebook'
    KAKAO = 'kakao'
    INSTA = 'insta'
    TWITTER = 'twitter'
    YOUTUBE = 'youtube'


class Proxy2Model(Base):
    """ORM row of the ``proxy2`` table: one proxy endpoint and its block state.

    Each ``<platform>_block_at`` column holds the time the proxy was last
    marked as blocked for that platform, or NULL when it is not blocked.
    """

    __tablename__ = 'proxy2'

    id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True, nullable=False, autoincrement=True)
    ip = sqlalchemy.Column(sqlalchemy.String(15), primary_key=True)
    # NOTE(review): SmallInteger may not hold ports above 32767 if the backend
    # maps it to a signed 16-bit column -- confirm against the real schema.
    port = sqlalchemy.Column(sqlalchemy.SmallInteger, primary_key=True)
    create_at = sqlalchemy.Column(sqlalchemy.DateTime, default=datetime.datetime.now)
    naver_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    daum_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    facebook_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    kakao_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    insta_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    twitter_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    youtube_block_at = sqlalchemy.Column(sqlalchemy.DateTime)
    dead = sqlalchemy.Column(sqlalchemy.Boolean, default=False)

    def __init__(self, ip, port):
        """Create a new, not-yet-persisted proxy row for ``ip``:``port``."""
        self.ip = ip
        self.port = port

    @staticmethod
    def _block_attr(platform):
        """Return the name of the block-timestamp attribute for ``platform``."""
        return '{}_block_at'.format(platform.value)

    @property
    def block_map(self):
        """Map every ``Platform`` to this row's current block timestamp.

        Computed on access, so it always reflects the current column values
        and is also available on rows loaded from the database (for which
        ``__init__`` is not invoked).
        """
        return {
            platform: getattr(self, self._block_attr(platform))
            for platform in Platform
        }

    def __repr__(self):
        return '{}:{} (twitter:{})'.format(self.ip, self.port, self.twitter_block_at)

    def get_instance_for_http(self):
        """Return a dict with ``http``/``https`` proxy addresses plus ``ip`` and ``port``.

        ``check_all_proxies`` passes this dict as the ``proxies`` argument of
        ``requests.get``.
        """
        address = '{}:{}'.format(self.ip, self.port)
        return {
            'http': address,
            'https': address,
            'ip': self.ip,
            'port': self.port,
        }

    def set_block_at(self, platform, value):
        """Set the block timestamp of ``platform`` to ``value`` (``None`` clears it).

        Arguments that are not ``Platform`` members are ignored.
        """
        if isinstance(platform, Platform):
            setattr(self, self._block_attr(platform), value)


class Proxy2Handler:
    """Hands out proxies from the ``proxy2`` table and maintains their block state."""

    # Platform -> mapped column holding that platform's block timestamp.
    block_field_map = {
        Platform.NAVER: Proxy2Model.naver_block_at,
        Platform.DAUM: Proxy2Model.daum_block_at,
        Platform.FACEBOOK: Proxy2Model.facebook_block_at,
        Platform.KAKAO: Proxy2Model.kakao_block_at,
        Platform.INSTA: Proxy2Model.insta_block_at,
        Platform.TWITTER: Proxy2Model.twitter_block_at,
        Platform.YOUTUBE: Proxy2Model.youtube_block_at,
    }

    # Platform -> URL requested through a proxy to test whether it works.
    check_url_map = {
        Platform.NAVER: 'https://www.naver.com',
        Platform.DAUM: 'https://www.daum.net',
        Platform.FACEBOOK: 'https://www.facebook.com',
        Platform.KAKAO: 'https://story.kakao.com',
        Platform.INSTA: 'https://www.instagram.com',
        Platform.TWITTER: 'https://twitter.com',
        Platform.YOUTUBE: 'https://www.youtube.com',
    }

    # Insert a (ip, port) pair only if no such row exists yet. Bound
    # parameters keep crawled values out of the SQL text.
    insert_if_absent_sql = sqlalchemy.text(
        "INSERT INTO proxy2(ip, PORT) "
        "SELECT :ip, :port FROM DUAL "
        "WHERE NOT EXISTS (SELECT * FROM proxy2 WHERE ip = :ip AND PORT = :port)"
    )

    def __init__(self, db_url=DEFAULT_DB_URL):
        """Create the engine and a scoped session.

        Args:
            db_url: SQLAlchemy database URL; defaults to ``DEFAULT_DB_URL``.
        """
        self.lock = threading.Lock()
        self.engine = sqlalchemy.create_engine(db_url)
        session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine, autocommit=True, autoflush=True)
        self.session = sqlalchemy.orm.scoped_session(session_factory)

    def lock_enter(self):
        """Locking hook; currently disabled (no-op)."""
        # logger.log('lock {}'.format(threading.current_thread().ident))
        # self.lock.acquire()
        pass

    def lock_leave(self):
        """Counterpart of ``lock_enter``; currently disabled (no-op)."""
        # self.lock.release()
        # logger.log('unlock {}'.format(threading.current_thread().ident))
        pass

    def commit(self):
        """Commit the current session.

        NOTE(review): the session is created with ``autocommit=True``; confirm
        that an explicit ``commit()`` is valid in that mode for the SQLAlchemy
        version in use.
        """
        self.lock_enter()
        try:
            self.session.commit()
        finally:
            self.lock_leave()

    def get_oldest(self, platform):
        """Return the first row when ordered by ``platform``'s block time, descending.

        NOTE(review): the name says "oldest" but the ordering is descending --
        confirm which one is intended.
        """
        self.lock_enter()
        try:
            return self.session.query(Proxy2Model).order_by(self.block_field_map[platform].desc()).first()
        finally:
            self.lock_leave()

    def get_query(self, ip, port):
        """Return a query for the rows matching ``ip`` and ``port``."""
        return self.session.query(Proxy2Model).filter_by(ip=ip).filter_by(port=port)

    def get_instance(self, ip, port):
        """Return the first row matching ``ip`` and ``port``, or ``None``."""
        self.lock_enter()
        try:
            return self.get_query(ip, port).first()
        finally:
            self.lock_leave()

    def check_all_proxies(self, platform):
        """Re-test up to 8 blocked, non-dead proxies against ``platform``.

        Rows are taken in ascending order of their block timestamp. A proxy
        that answers with an OK response has its block timestamp cleared; a
        ``ProxyError`` marks it dead; any other failure or a non-OK response
        refreshes its block timestamp to now.

        Returns:
            The number of proxies that responded OK.
        """
        print('check all start')
        block_column = self.block_field_map[platform]
        target_url = self.check_url_map[platform]

        instances = (
            self.session.query(Proxy2Model)
            .filter(block_column.isnot(None))
            .filter_by(dead=False)
            .order_by(block_column)
            .limit(8)
            .all()
        )

        alive_cnt = 0
        for instance in instances:
            try:
                resp = requests.get(target_url, proxies=instance.get_instance_for_http(), timeout=1)
            except requests.exceptions.ProxyError:
                # Must precede the generic handler: ProxyError means the proxy
                # itself is unusable, not merely blocked by the platform.
                instance.dead = True
                continue
            except Exception:
                # Deliberately broad: every other failure counts as "blocked".
                instance.set_block_at(platform, datetime.datetime.now())
                continue

            if resp.ok:
                instance.set_block_at(platform, None)
                alive_cnt += 1
                print('proxy {}:{} alive'.format(instance.ip, instance.port))
            else:
                instance.set_block_at(platform, datetime.datetime.now())

        print('check all end')
        return alive_cnt

    def _recreate_session(self, proc_id):
        """Replace ``self.session`` with a fresh scoped session; log if that fails."""
        try:
            # NOTE(review): unlike __init__, this factory does not pass
            # autocommit/autoflush -- kept as in the original, confirm intent.
            session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine)
            self.session = sqlalchemy.orm.scoped_session(session_factory)
        except Exception:
            logger.log('{} session recreate'.format(proc_id))

    def get(self, platform, proc_id=-1):
        """Return a random proxy that is not blocked for ``platform``.

        When no unblocked proxy exists, blocked proxies are re-tested and, if
        none of them recovered, new proxies are crawled and inserted; the
        lookup is then repeated until a proxy is found.

        Args:
            platform: ``Platform`` member the proxy will be used for.
            proc_id: caller identifier, used only in the log message.

        Returns:
            The dict produced by ``Proxy2Model.get_instance_for_http``, or
            ``None`` when the lookup query itself failed.
        """
        block_column = self.block_field_map[platform]

        # A loop rather than self-recursion, so a long dry spell cannot
        # exhaust the interpreter's recursion limit.
        while True:
            self.lock_enter()
            try:
                instances = self.session.query(Proxy2Model).filter(block_column.is_(None)).all()
            except Exception:
                self._recreate_session(proc_id)
                return None
            finally:
                self.lock_leave()

            if instances:
                return random.choice(instances).get_instance_for_http()

            if self.check_all_proxies(platform) <= 0:
                self.insert_all(proxy_crawler.crawl_proxies())

    def insert(self, ip, port):
        """Add a new row for ``ip``:``port`` to the session unless one already exists.

        The row is only added to the session here; nothing in this method
        flushes or commits it.
        """
        if self.get_instance(ip, port):
            return
        proxy = Proxy2Model(ip, port)
        self.lock_enter()
        try:
            self.session.add(proxy)
        finally:
            self.lock_leave()

    def insert_all(self, proxies):
        """Insert every proxy in ``proxies`` that is not already in the table.

        Args:
            proxies: iterable of mappings with ``'ip'`` and ``'port'`` keys;
                ``None`` is treated as empty.
        """
        # The original author noted of this statement: "doesn't work --
        # duplicates get inserted, thread terminates".
        # NOTE(review): the NOT EXISTS check and the INSERT are not protected
        # against concurrent writers in other processes; a unique constraint
        # on (ip, port) would be the robust fix.
        with self.lock:  # released even if an execute fails
            for proxy in proxies or ():
                self.engine.execute(
                    self.insert_if_absent_sql,
                    {'ip': proxy['ip'], 'port': int(proxy['port'])},
                )

    def set_proxy_blocked(self, ip, port, platform):
        """Mark the proxy ``ip``:``port`` as blocked for ``platform`` as of now."""
        block_column = self.block_field_map[platform]
        self.get_query(ip, port).update({block_column: datetime.datetime.now()})
        self.commit()


if __name__ == '__main__':
    proxy_handler = Proxy2Handler()
    # proxy_handler.insert('127.0.0.5', 80)
    # proxy_handler.commit()
    # proxy_handler.set_proxy_blocked('127.0.0.3', 80, Platform.TWITTER)
    # instance = proxy_handler.get(Platform.TWITTER)
    # instance = proxy_handler.get_oldest(Platform.TWITTER)
    # print(instance)
    proxy = proxy_handler.get(Platform.TWITTER)