diff --git a/WebBasedCrawler/crawlerstatistic.py b/WebBasedCrawler/crawlerstatistic.py
new file mode 100644
index 0000000..0df4b1d
--- /dev/null
+++ b/WebBasedCrawler/crawlerstatistic.py
@@ -0,0 +1,260 @@
+import re
+import datetime
+import os
+import argparse
+import collections
+
+re_ok = re.compile(r'^\[(\d{2}:\d{2}:\d{2})\] o ')
+re_day = re.compile(r'^\[(\d{2}:\d{2}:\d{2})\]')
+str_naver_blog = "where=post"
+str_naver_cafe = "where=article"
+str_daum_cafe = "search.daum.net"
+re_log_file = re.compile(r"(\d{4}-\d{2}-\d{2})_\d+\.log$")
+re_log_file_mon = re.compile(r"(\d{4}-\d{2})-\d{2}_\d+\.log$")
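+
+# Judging by the patterns above, a crawler log line has the form "[HH:MM:SS] o <url>",
+# where "o" appears to mark a successfully crawled URL: re_day matches any
+# timestamped line and re_ok only the successful ones, while re_log_file and
+# re_log_file_mon match log file names such as 2016-05-25_996.log.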
"navercafe", "daumcafe", 'None']} for k in file_prefix} for log_file in log_files: result = anlayze_crawler_log_file(log_file) log_type = get_log_type(log_file) if mon: m = re_log_file_mon.search(log_file) else: m = re_log_file.search(log_file) if m: total_result[m.group(1)][log_type][0] += result[0] total_result[m.group(1)][log_type][1] += result[1] return total_result def print_analyzer_dir_day(dir_path, mon=False): total_result = analyze_crawler_log_dir_day(dir_path, mon) od = collections.OrderedDict(sorted(total_result.items())) for k, v in od.items(): for x, y in v.items(): if y[0]: print(k, ':', end=' ') print("{0:<10}".format(str(x)), end=' ') print(":", end=' ') print("{0:>11}".format(y[0]), end=' ') print("seconds,", end=' ') print("{0:>7}".format(y[1]), end=' ') print("contents,", end=' ') if y[1]: print("{0:>7.2f}".format(y[0]/y[1]), end=' ') print("seconds/content") else: print() def print_analyze_dir(dir_path, str_date=None): total_result = analyze_crawler_log_dir(dir_path, str_date) od = collections.OrderedDict(sorted(total_result.items())) for k, v in od.items(): try: if v[0]: print("{0:<10}".format(str(k)), end=' ') print(":", end=' ') print("{0:>11}".format(v[0]), end=' ') print("seconds,", end=' ') print("{0:>7}".format(v[1]), end=' ') print("contents,", end=' ') if v[1]: print("{0:>7.2f}".format(v[0]/v[1]), end=' ') print("seconds/content") else: print() except Exception as e: print("error occur :", e) def print_analyze_file(file_name): total_result = anlayze_crawler_log_file(file_name) log_type = get_log_type(file_name) try: print("{0:<10}".format(str(log_type)), end=' ') print(":", end=' ') print("{0:>11}".format(total_result[0]), end=' ') print("seconds,", end=' ') print("{0:>7}".format(total_result[1]), end=' ') print("contents,", end=' ') if total_result[1]: print("{0:>7.2f}".format(total_result[0]/total_result[1]), end=' ') print("seconds/content") else: print() except Exception as e: print("error occur :", e) dir_path = "C:\\data\\crawlerlog\\this" print_analyzer_dir_day(dir_path) # #str_date = "2016-01" # #print_analyze_dir(dir_path, str_date) # print_analyze_dir(dir_path) # #file_name = "C:\\data\\crawlerlog\\1\\Crawler.01.20\\2016-04-09_27526.log" # file_name = "E:\\QtLibrary\\mingw5.4.0.32\\2016-05-25_5656.log" # print_analyze_file(file_name) # print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_996.log") # print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_6936.log") # print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_7048.log") # print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_9116.log") #print(analyze_crawler_log_dir(dir_path)) # file1 = "C:\\data\\solidip\\notnaver.txt" # file2 = "C:\\data\\solidip\\naver.txt" # file3 = "C:\\data\\solidip\\result.txt" # diff_ip(file1, file2, file3) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Analyze crawler log file') parser.add_argument('-f', action='store', type=str, default=None, help='Analyze a log file') parser.add_argument('-d', action='store', type=str, default=None, help='Analyze all log files in directory') parser.add_argument('-o', action='store', type=str, default=None, help='Analyze log files containing specified' 'a string with -d option') parser.add_argument('-m', action='store', type=str, default=None, help='Analyze log files monthly or daily. ' '-m m : monthly, -m d : daily. 
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Analyze crawler log files')
+    parser.add_argument('-f', action='store', type=str, default=None,
+                        help='Analyze a single log file')
+    parser.add_argument('-d', action='store', type=str, default=None,
+                        help='Analyze all log files in a directory')
+    parser.add_argument('-o', action='store', type=str, default=None,
+                        help='With -d, only analyze log files whose names '
+                             'contain the given string')
+    parser.add_argument('-m', action='store', type=str, default=None,
+                        help='Group results monthly or daily: -m m for monthly, '
+                             '-m d for daily. This ignores the -o option')
+    args = parser.parse_args()
+    if args.f:
+        print_analyze_file(args.f)
+    elif args.d:
+        if args.m:
+            if args.m == 'm':
+                print_analyze_dir_day(args.d, True)
+            elif args.m == 'd':
+                print_analyze_dir_day(args.d, False)
+            else:
+                parser.print_help()
+        else:
+            print_analyze_dir(args.d, args.o)
+    else:
+        parser.print_help()