diff --git a/WebBasedCrawler/base/baseclasses.py b/WebBasedCrawler/base/baseclasses.py
index 70fc8a8..8203cc2 100644
--- a/WebBasedCrawler/base/baseclasses.py
+++ b/WebBasedCrawler/base/baseclasses.py
@@ -32,6 +32,7 @@ def is_debugger_attached():
 
 is_debug = is_debugger_attached()
 
+
 def printl(*objects, sep=' ', end='\n', file=None, flush=True):
     if is_debug:
         cur_frame = inspect.currentframe()
diff --git a/WebBasedCrawler/base/dbdata.py b/WebBasedCrawler/base/dbdata.py
index d08d986..e36185d 100644
--- a/WebBasedCrawler/base/dbdata.py
+++ b/WebBasedCrawler/base/dbdata.py
@@ -24,6 +24,44 @@ class DataDBRow:
         self.reply_url = None
         self.etc = None
 
+    @staticmethod
+    def _column_names():
+        """
+        Return the attribute names of a freshly constructed DataDBRow.
+
+        Names starting with '__' and attributes holding callables are skipped.
+        """
+        template = DataDBRow()
+        return tuple(name for name, default in template.__dict__.items()
+                     if not name.startswith('__') and not callable(default))
+
+    def get_keys(self):
+        """
+        Return the column names as a tuple, in the same order as get_values().
+        """
+        return self._column_names()
+
+    def get_values(self, conn, db_num):
+        """
+        Return SQL-ready values as a tuple, in the same order as get_keys().
+
+        :param conn: connection object providing escape(); used for every
+            non-numeric value
+        :param db_num: not used by this method
+        :return: tuple; numbers rendered with str(), the rest via conn.escape()
+        """
+        values = []
+        for name in self._column_names():
+            value = self.__dict__[name]
+            if isinstance(value, Number):
+                values.append(str(value))
+            else:
+                # str values take the same path as everything else:
+                # escape() receives them unchanged.
+                values.append(conn.escape(value))
+
+        return tuple(values)
+
     def get_insert_query(self, conn, db_num):
         inst = DataDBRow()
 
diff --git a/WebBasedCrawler/base/proxy.py b/WebBasedCrawler/base/proxy.py
index e3f5d6c..a36367c 100644
--- a/WebBasedCrawler/base/proxy.py
+++ b/WebBasedCrawler/base/proxy.py
@@ -97,6 +97,51 @@ def get_driver(platform, proxies):
     else:
         return platform_webdriver[platform](capabilities=desired_capabilities)
 
+_expired_proxies = []
+
+
+def set_proxy_expired(proxy):
+    """
+    Record a proxy as expired and disable its entry in the proxy list file.
+
+    The first line of ``proxy_filename`` that starts with the proxy address is
+    prefixed with '# ' and moved to the end of the file.
+
+    :param proxy: dict whose 'http' value has the form 'http://<address>'
+    """
+    if proxy not in _expired_proxies:
+        _expired_proxies.append(proxy)
+
+    address = proxy['http'][len('http://'):]
+    if not address:
+        # An empty prefix would match every line; there is nothing to disable.
+        return
+
+    with open(proxy_filename, 'r') as f:
+        lines = f.readlines()
+
+    for idx, line in enumerate(lines):
+        # The character after the address must not be a digit, otherwise
+        # '1.2.3.4:80' would also disable the entry for '1.2.3.4:8080'.
+        next_char = line[len(address):len(address) + 1]
+        if line.startswith(address) and not next_char.isdigit():
+            break
+    else:
+        # No active entry for this address: leave the file untouched.
+        return
+
+    expired_line = '# ' + lines.pop(idx)
+    # The file may not end with a newline; terminate both lines so the moved
+    # entry is not glued onto the line that is now last.
+    if lines and not lines[-1].endswith('\n'):
+        lines[-1] += '\n'
+    if not expired_line.endswith('\n'):
+        expired_line += '\n'
+    lines.append(expired_line)
+
+    with open(proxy_filename, 'w') as f:
+        f.writelines(lines)
+
 
 def get_proxy_from_file(filename):
     """
@@ -104,7 +149,7 @@ def get_proxy_from_file(filename):
     :return (ip, port): string, string
         if ip, port or filename is invalid, return (None, None)
     """
-    proxy_lists = [line.replace('\n', '') for line in open(filename) if re_ip.search(line)]
+    proxy_lists = [line.replace('\n', '') for line in open(filename) if not line.strip().startswith('#') and re_ip.search(line)]
     if proxy_lists:
         m = re_ip.search(proxy_lists[random.randint(0, len(proxy_lists) - 1)])
         if m: