"""Analyze crawler log files.

Each crawler writes a log named ``YYYY-MM-DD_<pid>.log`` whose lines start
with ``[HH:MM:SS]``; a successful crawl is logged as ``[HH:MM:SS] ok``.
This module computes, per file / per directory / per day or month, the
elapsed wall time and the number of successfully crawled contents.
"""
import re
import datetime
import os
import argparse
import collections

# "[HH:MM:SS] ok" marks one successfully crawled content.
re_ok = re.compile(r'^\[([\d]{2}:[\d]{2}:[\d]{2})\] ok')
# Any timestamped line; first/last match give the crawl's time span.
re_day = re.compile(r'^\[([\d]{2}:[\d]{2}:[\d]{2})\]')
# URL fragments that identify which service a log file crawled.
str_naver_blog = "where=post"
str_naver_cafe = "where=article"
str_daum_cafe = "search.daum.net"
str_insta_post = "www.instagram.com/"
str_kakao_story = "story.kakao.com/"
str_naver_news = "news.naver.com/"
# Log file names: "YYYY-MM-DD_<pid>.log"; the _mon variant captures "YYYY-MM".
re_log_file = re.compile(r"([\d]{4}-[\d]{2}-[\d]{2})_[\d]+\.log$")
re_log_file_mon = re.compile(r"([\d]{4}-[\d]{2})-[\d]{2}_[\d]+\.log$")
log_type_list = ["naverblog", "navercafe", "daumcafe", 'instagram', 'kakaostory', 'navernews', 'None']

# Checked in order: first marker found on any line decides the log type.
_LOG_TYPE_MARKERS = [
    (str_naver_blog, "naverblog"),
    (str_naver_cafe, "navercafe"),
    (str_daum_cafe, "daumcafe"),
    (str_insta_post, "instagram"),
    (str_kakao_story, "kakaostory"),
    (str_naver_news, "navernews"),
]


def get_log_type(log_file_name):
    """Return the crawler type of *log_file_name* by scanning for a
    service-specific URL marker; ``'None'`` (the string) if none is found."""
    with open(log_file_name) as f:
        for line in f:
            for marker, log_type in _LOG_TYPE_MARKERS:
                if marker in line:
                    return log_type
    return 'None'


def crawler_log_analysis(file_log_name):
    """Print start time, end time and success count of one log file."""
    match_count = 0
    start_time = ""
    end_time = ""
    with open(file_log_name) as f:
        for line in f:
            m = re_day.search(line)
            if m:
                if not start_time:       # first timestamped line seen
                    start_time = m.group(1)
                end_time = m.group(1)    # keep updating: last one wins
            if re_ok.search(line):
                match_count += 1
    print("start time = " + start_time)
    print("end time = " + end_time)
    print("# of success crawler = " + str(match_count))


def anlayze_crawler_log_file(log_file_name):
    """Return ``(elapsed_seconds, success_count)`` for one log file.

    NOTE: the misspelled name ("anlayze") is kept for backward compatibility
    with existing callers.
    Decoding errors while reading are reported and analysis of that file
    stops with whatever was counted so far (best effort, as before).
    """
    match_count = 0
    start_time = ""
    end_time = ""
    with open(log_file_name) as f:
        try:
            for line in f:
                m = re_day.search(line)
                if m:
                    if not start_time:
                        start_time = m.group(1)
                    end_time = m.group(1)
                if re_ok.search(line):
                    match_count += 1
        except Exception as e:
            print("filename =", log_file_name)
            print("error occur: ", e)
    # FIX: a log with no timestamped lines used to crash in
    # elapsed_second() -> strptime("") with ValueError.
    if not start_time:
        return 0, match_count
    return elapsed_second(start_time, end_time), match_count


def time_to_second(val_time):
    """Convert a ``datetime.time`` to seconds since midnight.

    Raises TypeError for any other type (as the original did).
    """
    if not isinstance(val_time, datetime.time):
        raise TypeError("expected datetime.time, got %r" % type(val_time))
    return val_time.hour * 3600 + val_time.minute * 60 + val_time.second


def str_to_time(str_time):
    """Parse ``"HH:MM:SS"`` into a ``datetime.time``."""
    return datetime.datetime.strptime(str_time, "%H:%M:%S").time()


def elapsed_second(str_time1, str_time2):
    """Return the absolute difference in seconds between two "HH:MM:SS"
    strings (order of arguments does not matter)."""
    sec1 = time_to_second(str_to_time(str_time1))
    sec2 = time_to_second(str_to_time(str_time2))
    return abs(sec2 - sec1)


def diff_ip(input_file1, input_file2, output_file):
    """Write to *output_file* the lines present in file1 but not file2;
    if that set is empty, write the lines present in file2 but not file1.

    FIX: files are now closed via ``with`` (the output file was previously
    leaked if an exception occurred).
    """
    with open(input_file1) as f1:
        lines1 = set(f1)
    with open(input_file2) as f2:
        lines2 = set(f2)
    result = lines1 - lines2
    if not result:
        result = lines2 - lines1
    with open(output_file, "w") as f3:
        f3.writelines(result)


def get_log_files(dir_path, str_date=None):
    """Recursively collect crawler log paths under *dir_path*.

    Without *str_date*, a file must match ``YYYY-MM-DD_<pid>.log``; with it,
    any ``*.log`` file whose name contains *str_date* qualifies.
    """
    found = []
    for path, _dirs, files in os.walk(dir_path):
        for filename in files:
            if not str_date:
                if re_log_file.match(filename):
                    found.append(os.path.join(path, filename))
            elif str_date in filename and filename.endswith(".log"):
                found.append(os.path.join(path, filename))
    return found


def analyze_crawler_log_dir(dir_path, str_date=None):
    """Aggregate ``{log_type: [elapsed_seconds, success_count]}`` over every
    log file under *dir_path* (optionally filtered by *str_date*)."""
    total_result = {k: [0, 0] for k in log_type_list}
    for log_file in get_log_files(dir_path, str_date):
        elap, count = anlayze_crawler_log_file(log_file)
        log_type = get_log_type(log_file)
        total_result[log_type][0] += elap
        total_result[log_type][1] += count
    return total_result


def analyze_crawler_log_dir_day(dir_path, mon=False):
    """Aggregate ``{date_prefix: {log_type: [seconds, count]}}``.

    The prefix is ``YYYY-MM-DD`` (daily) or ``YYYY-MM`` when *mon* is true.
    """
    log_files = get_log_files(dir_path)
    re_prefix = re_log_file_mon if mon else re_log_file
    file_prefix = set()
    for log_file in log_files:
        m = re_prefix.search(log_file)
        if m:
            file_prefix.add(m.group(1))
    total_result = {k: {v: [0, 0] for v in log_type_list} for k in file_prefix}
    for log_file in log_files:
        elap, count = anlayze_crawler_log_file(log_file)
        log_type = get_log_type(log_file)
        m = re_prefix.search(log_file)
        if m:
            total_result[m.group(1)][log_type][0] += elap
            total_result[m.group(1)][log_type][1] += count
    return total_result


def _print_stats(label, elap_sec, count):
    # One aligned summary line: "<label> : <sec> seconds, <n> contents, ..."
    # Shared by all three print_* entry points (was triplicated).
    print("{0:<10}".format(str(label)), end=' ')
    print(":", end=' ')
    print("{0:>11}".format(elap_sec), end=' ')
    print("seconds,", end=' ')
    print("{0:>7}".format(count), end=' ')
    print("contents,", end=' ')
    if count:
        print("{0:>7.2f}".format(elap_sec / count), end=' ')
        print("seconds/content")
    else:
        print()


def print_analyzer_dir_day(dir_path, mon=False):
    """Print per-day (or per-month with *mon*) per-type statistics,
    skipping entries with zero elapsed time."""
    total_result = analyze_crawler_log_dir_day(dir_path, mon)
    for prefix, per_type in sorted(total_result.items()):
        for log_type, (elap, count) in per_type.items():
            if elap:
                print(prefix, ':', end=' ')
                _print_stats(log_type, elap, count)


def print_analyze_dir(dir_path, str_date=None):
    """Print per-type statistics for all logs under *dir_path*."""
    total_result = analyze_crawler_log_dir(dir_path, str_date)
    for log_type, (elap, count) in sorted(total_result.items()):
        try:
            if elap:
                _print_stats(log_type, elap, count)
        except Exception as e:
            print("error occur :", e)


def print_analyze_file(file_name):
    """Print statistics for a single log file."""
    elap, count = anlayze_crawler_log_file(file_name)
    log_type = get_log_type(file_name)
    try:
        _print_stats(log_type, elap, count)
    except Exception as e:
        print("error occur :", e)


# FIX: an unconditional print_analyzer_dir_day("C:\\data\\crawlerlog\\this")
# used to run here at import time — before the __main__ guard — so the tool
# crashed on any machine without that hard-coded path and ran the debug
# analysis even when other CLI options were given.  Kept below, commented,
# as a usage example.
# print_analyzer_dir_day("C:\\data\\crawlerlog\\this")
# print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_996.log")
# diff_ip("C:\\data\\solidip\\notnaver.txt",
#         "C:\\data\\solidip\\naver.txt",
#         "C:\\data\\solidip\\result.txt")


def _main():
    """Parse command-line options and dispatch to the right analyzer."""
    parser = argparse.ArgumentParser(description='Analyze crawler log file')
    parser.add_argument('-f', action='store', type=str, default=None,
                        help='Analyze a log file')
    parser.add_argument('-d', action='store', type=str, default=None,
                        help='Analyze all log files in directory')
    parser.add_argument('-o', action='store', type=str, default=None,
                        help='Analyze log files containing specified '
                             'a string with -d option')
    parser.add_argument('-m', action='store', type=str, default=None,
                        help='Analyze log files monthly or daily. '
                             '-m m : monthly, -m d : daily. '
                             'This ignore -o option')
    args = parser.parse_args()
    if args.f:
        print_analyze_file(args.f)
    elif args.d:
        if args.m:
            if args.m == 'm':
                print_analyzer_dir_day(args.d, True)
            # FIX: was "elif args.d == 'd'" — compared the directory path to
            # 'd', so "-m d" never reached the daily branch.
            elif args.m == 'd':
                print_analyzer_dir_day(args.d, False)
            else:
                parser.print_help()
        else:
            print_analyze_dir(args.d, args.o)
    else:
        parser.print_help()


if __name__ == '__main__':
    _main()