From 1496644cc2831962ef12987dfcb7a5b1332aa7b5 Mon Sep 17 00:00:00 2001
From: mjjo
Date: Thu, 20 Jul 2017 10:58:13 +0900
Subject: [PATCH] =?UTF-8?q?=EC=A0=91=EC=86=8D=20=EC=8B=A4=ED=8C=A8?=
 =?UTF-8?q?=ED=95=9C=20=ED=94=84=EB=A1=9D=EC=8B=9C=20=EC=A3=BC=EC=84=9D?=
 =?UTF-8?q?=EC=B2=98=EB=A6=AC=ED=95=B4=EC=84=9C=20=EB=8B=A4=EC=8B=9C=20?=
 =?UTF-8?q?=EC=82=AC=EC=9A=A9=ED=95=98=EC=A7=80=20=EC=95=8A=EB=8A=94=20?=
 =?UTF-8?q?=EA=B8=B0=EB=8A=A5=20=EC=B6=94=EA=B0=80=20=20-=20=ED=94=8C?=
 =?UTF-8?q?=EB=9E=AB=ED=8F=BC=EB=B3=84=EB=A1=9C=20=EC=B2=98=EB=A6=AC?=
 =?UTF-8?q?=ED=95=98=EB=8A=94=20=EA=B8=B0=EB=8A=A5=20=ED=95=84=EC=9A=94?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Notes (this section is skipped when the patch is applied):

Subject in English: "Add a feature that comments out proxies that failed
to connect so they are not used again - per-platform handling is still
needed."

Changes compared with the first version of this patch:
* dbdata.py: get_values() iterates get_keys() instead of repeating the
  attribute filter, so keys and values always line up.
* proxy.py, set_proxy_expired(): an address no longer matches an entry
  with a longer port (1.2.3.4:80 vs 1.2.3.4:8080); the moved entry is not
  glued onto a last line that lacks a newline; the file is rewritten only
  when an entry was actually disabled.
* proxy.py, get_proxy_from_file(): the proxy file is closed after reading.
* The blob ids on the "index" lines are those of the first version.

 WebBasedCrawler/base/baseclasses.py |  1 +
 WebBasedCrawler/base/dbdata.py      | 32 +++++++++++++++++++++++++
 WebBasedCrawler/base/proxy.py       | 48 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/WebBasedCrawler/base/baseclasses.py b/WebBasedCrawler/base/baseclasses.py
index 70fc8a8..8203cc2 100644
--- a/WebBasedCrawler/base/baseclasses.py
+++ b/WebBasedCrawler/base/baseclasses.py
@@ -32,6 +32,7 @@ def is_debugger_attached():
 
 is_debug = is_debugger_attached()
 
+
 def printl(*objects, sep=' ', end='\n', file=None, flush=True):
     if is_debug:
         cur_frame = inspect.currentframe()
diff --git a/WebBasedCrawler/base/dbdata.py b/WebBasedCrawler/base/dbdata.py
index d08d986..e36185d 100644
--- a/WebBasedCrawler/base/dbdata.py
+++ b/WebBasedCrawler/base/dbdata.py
@@ -24,6 +24,38 @@ class DataDBRow:
         self.reply_url = None
         self.etc = None
 
+    def get_keys(self):
+        """Return the column names of a row as a tuple.
+
+        The names are the attributes that a freshly constructed DataDBRow
+        defines, so attributes attached to an instance later are ignored.
+        """
+        return tuple(
+            key for key, default in DataDBRow().__dict__.items()
+            if not key.startswith('__') and not callable(default)
+        )
+
+    def get_values(self, conn, db_num):
+        """Return this row's values as SQL literals, in get_keys() order.
+
+        :param conn: object whose escape() quotes non-numeric values
+        :param db_num: not used here; mirrors get_insert_query()'s signature
+        :return: tuple holding str(value) for numbers, conn.escape(value) otherwise
+        """
+        values = []
+        for key in self.get_keys():
+            value = self.__dict__[key]
+            if isinstance(value, Number):
+                values.append(str(value))
+            elif isinstance(value, str):
+                # The encode/decode round trip raises early for text that
+                # cannot be encoded as UTF-8.
+                values.append(conn.escape(value.encode('utf8').decode('utf8')))
+            else:
+                values.append(conn.escape(value))
+
+        return tuple(values)
+
     def get_insert_query(self, conn, db_num):
         inst = DataDBRow()
 
diff --git a/WebBasedCrawler/base/proxy.py b/WebBasedCrawler/base/proxy.py
index e3f5d6c..a36367c 100644
--- a/WebBasedCrawler/base/proxy.py
+++ b/WebBasedCrawler/base/proxy.py
@@ -97,6 +97,49 @@ def get_driver(platform, proxies):
     else:
         return platform_webdriver[platform](capabilities=desired_capabilities)
 
+_expired_proxies = []
+
+
+def set_proxy_expired(proxy):
+    """Remember a failed proxy and comment it out in the proxy list file.
+
+    The first active line of ``proxy_filename`` that starts with the
+    proxy's address is prefixed with '# ' and moved to the end of the
+    file, so get_proxy_from_file() no longer picks it.
+
+    :param proxy: dict whose 'http' entry is expected to be 'http://ip:port'
+    """
+    if proxy not in _expired_proxies:
+        _expired_proxies.append(proxy)
+
+    address = proxy['http']
+    if address.startswith('http://'):
+        address = address[len('http://'):]
+
+    with open(proxy_filename, 'r') as f:
+        lines = f.readlines()
+
+    for idx, line in enumerate(lines):
+        # Require a non-digit after the address so that '1.2.3.4:80' does
+        # not also match a '1.2.3.4:8080' entry.
+        if line.startswith(address) and not line[len(address):][:1].isdigit():
+            break
+    else:
+        # Nothing to disable (unknown or already commented out).
+        return
+
+    expired = '# ' + lines.pop(idx)
+    if not expired.endswith('\n'):
+        expired += '\n'
+    # The old last line may lack a newline; without one the moved entry
+    # would be glued onto it.
+    if lines and not lines[-1].endswith('\n'):
+        lines[-1] += '\n'
+    lines.append(expired)
+
+    with open(proxy_filename, 'w') as f:
+        f.writelines(lines)
+
 
 def get_proxy_from_file(filename):
     """
@@ -104,7 +147,10 @@ def get_proxy_from_file(filename):
     :return (ip, port): string, string
         if ip, port or filename is invalid, return (None, None)
     """
-    proxy_lists = [line.replace('\n', '') for line in open(filename) if re_ip.search(line)]
+    # Lines starting with '#' are disabled entries and must not be chosen.
+    with open(filename) as f:
+        proxy_lists = [line.replace('\n', '') for line in f
+                       if not line.strip().startswith('#') and re_ip.search(line)]
     if proxy_lists:
         m = re_ip.search(proxy_lists[random.randint(0, len(proxy_lists) - 1)])
         if m: