Files
clients/WebBasedCrawler/effect/InstaUrlValidator.py

86 lines
2.8 KiB
Python

class InstaUrlValidator:
    """Validate an Instagram post URL and rebuild it in canonical form.

    The input is expected to look like ``[scheme://]host/p/<key>[/...]``.
    Validation walks the string left to right (scheme, host, ``/p/``
    segment, key) and each ``check_*`` method returns the index at which
    the next component starts.  ``get_insta_url`` is the main entry point
    and returns ``https://www.instagram.com/p/<key>/``.

    Raises (from the methods below):
        TypeError: the input is not a string.
        ValueError: the input does not match the expected URL shape.
    """

    def __init__(self, input_url):
        """Store the raw input and the components of the canonical URL."""
        self.protocol = 'https'
        self.host = 'www.instagram.com'
        self.path1 = 'p'
        self.input_user_key = ''
        self.input_url = input_url
        # Filled in by preprocess_input_url(); initialised here so the
        # check_* methods never hit a missing attribute.
        self.preprocessed_input_url = ''

    def _accepted_hosts(self):
        """Return the accepted host spellings: ``self.host`` with and without ``www.``."""
        canonical = self.host.lower()
        bare = canonical[4:] if canonical.startswith('www.') else canonical
        return (canonical, bare)

    def preprocess_input_url(self):
        """Check that the input is a string and strip surrounding whitespace.

        Raises:
            TypeError: if ``self.input_url`` is not a ``str``.
        """
        if not isinstance(self.input_url, str):
            raise TypeError('input url error')
        self.preprocessed_input_url = self.input_url.strip()

    def check_protocol(self):
        """Return the index just past an optional ``scheme://`` prefix.

        Returns 0 when the URL has no scheme.  Any scheme name is accepted;
        the rebuilt URL always uses ``self.protocol``.

        Raises:
            ValueError: if a ``:`` appears before the first ``/`` but is not
                followed by ``//``.
        """
        url = self.preprocessed_input_url
        # Only a colon located before the first '/' can terminate a scheme;
        # colons later on (for example inside a query string) are ignored.
        first_slash = url.find('/')
        search_end = first_slash if first_slash != -1 else len(url)
        colon_index = url.find(':', 0, search_end)
        if colon_index == -1:
            return 0
        # Slicing (instead of indexing) keeps short inputs such as "https:"
        # from raising IndexError.
        if url[colon_index + 1:colon_index + 3] != '//':
            raise ValueError('incorrect url format')
        return colon_index + 3

    def check_host(self, start_index):
        """Validate the host that starts at ``start_index``.

        The host must equal ``self.host`` or the same name without its
        leading ``www.``; the comparison ignores case.

        Returns:
            The index just past the ``/`` that terminates the host.

        Raises:
            ValueError: if no ``/`` follows the host or the host is not accepted.
        """
        url = self.preprocessed_input_url
        end_index = url.find('/', start_index)
        if end_index == -1:
            raise ValueError('incorrect url format')
        input_host = url[start_index:end_index].lower()
        # Exact match on purpose: a substring test would also accept '',
        # 'gram', '.com' and similar fragments of the real host name.
        if input_host not in self._accepted_hosts():
            raise ValueError('incorrect host')
        return end_index + 1

    def check_path1(self, start_index):
        """Validate that the first path segment equals ``self.path1``.

        Returns:
            The index just past the ``/`` that terminates the segment.

        Raises:
            ValueError: if the segment is not terminated by ``/`` or differs
                from ``self.path1``.
        """
        url = self.preprocessed_input_url
        end_index = url.find('/', start_index)
        if end_index == -1:
            raise ValueError('incorrect path')
        if url[start_index:end_index] != self.path1:
            raise ValueError('incorrect path (/p/)')
        return end_index + 1

    def check_path2(self, start_index):
        """Extract the key segment into ``self.input_user_key``.

        The key runs from ``start_index`` up to the first ``/``, ``?`` or
        ``#`` (or the end of the string), so a trailing slash is optional
        and a query string or fragment never ends up in the key.
        """
        key = self.preprocessed_input_url[start_index:]
        for delimiter in ('/', '?', '#'):
            key = key.partition(delimiter)[0]
        self.input_user_key = key

    def make_instagram_url(self):
        """Build the canonical URL from the extracted key.

        Raises:
            ValueError: if no key has been extracted (the key is empty).
        """
        if not self.input_user_key:
            raise ValueError('incorrect user key')
        return '{}://{}/{}/{}/'.format(
            self.protocol, self.host, self.path1, self.input_user_key)

    def validate_url(self):
        """Run every validation step in order; exceptions propagate to the caller."""
        # Drop any key left over from a previous run so a failed validation
        # cannot leave a stale key behind.
        self.input_user_key = ''
        self.preprocess_input_url()
        start_index = self.check_protocol()
        start_index = self.check_host(start_index)
        start_index = self.check_path1(start_index)
        self.check_path2(start_index)

    def get_insta_url(self):
        """Validate ``self.input_url`` and return the canonical URL.

        Raises:
            TypeError: if the input is not a string.
            ValueError: if the input is not a valid ``/p/<key>`` URL.
        """
        self.validate_url()
        return self.make_instagram_url()