clients/WebBasedCrawler/crawlerstatistic.py
admin fb14e0dbe2 crawlerstatistic
Crawler statistics viewer program

git-svn-id: svn://192.168.0.12/source@291 8346c931-da38-4b9b-9d4c-e48b93cbd075
2016-09-06 02:44:52 +00:00
import re
import datetime
import os
import argparse
import collections

# Timestamped log lines start with "[HH:MM:SS]"; re_ok additionally matches
# the " o " marker that flags a successful crawl.  Raw strings avoid invalid
# escape-sequence warnings on newer Pythons.
re_ok = re.compile(r'^\[([\d]{2}:[\d]{2}:[\d]{2})\] o ')
re_day = re.compile(r'^\[([\d]{2}:[\d]{2}:[\d]{2})\]')

# URL fragments used to classify which service a log file was crawling.
str_naver_blog = "where=post"
str_naver_cafe = "where=article"
str_daum_cafe = "search.daum.net"

# Log file names follow the pattern "YYYY-MM-DD_<number>.log"; the second
# pattern captures only the year and month.
re_log_file = re.compile(r"([\d]{4}-[\d]{2}-[\d]{2})_[\d]+\.log$")
re_log_file_mon = re.compile(r"([\d]{4}-[\d]{2})-[\d]{2}_[\d]+\.log$")
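
# Illustrative sketch of the line shapes these patterns expect (the exact
# log format is inferred from the regexes, not confirmed by the source):
#   [13:05:42] o http://search.naver.com/search.naver?where=post&...  <- re_day and re_ok
#   [13:05:43] some other message                                     <- re_day only
# re_ok.search("[13:05:42] o ...").group(1) yields "13:05:42".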

def get_log_type(log_file_name):
    with open(log_file_name) as f:
        for line in f:
            if line.find(str_naver_blog) != -1:
                return "naverblog"
            elif line.find(str_naver_cafe) != -1:
                return "navercafe"
            elif line.find(str_daum_cafe) != -1:
                return "daumcafe"
    # The string 'None' (not the None object) is used as a dict key by callers.
    return 'None'
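
# Hypothetical usage (file name is illustrative): a log whose crawled URLs
# contain "where=post" is classified as a Naver blog log.
#   get_log_type("2016-04-09_27526.log")  # -> "naverblog"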

def crawler_log_analysis(file_log_name):
    match_count = 0
    b_first = True
    start_time = ""
    end_time = ""
    with open(file_log_name) as f:
        for line in f:
            m = re_day.search(line)
            if m:
                if b_first:
                    b_first = False
                    start_time = m.group(1)
                end_time = m.group(1)
            m = re_ok.search(line)
            if m:
                match_count += 1
    print("start time = " + start_time)
    print("end time = " + end_time)
    print("# of successful crawls = " + str(match_count))

def analyze_crawler_log_file(log_file_name):
    """Return (elapsed_seconds, success_count) for a single log file."""
    match_count = 0
    b_first = True
    start_time = ""
    end_time = ""
    with open(log_file_name) as f:
        try:
            for line in f:
                m = re_day.search(line)
                if m:
                    if b_first:
                        b_first = False
                        start_time = m.group(1)
                    end_time = m.group(1)
                m = re_ok.search(line)
                if m:
                    match_count += 1
        except Exception as e:
            print("filename =", log_file_name)
            print("error occurred:", e)
    if not start_time:
        # No timestamped lines were found; avoid passing "" to strptime.
        return 0, match_count
    elap_sec = elapsed_second(start_time, end_time)
    return elap_sec, match_count

def time_to_second(val_time):
    if isinstance(val_time, datetime.time):
        return val_time.hour * 3600 + val_time.minute * 60 + val_time.second
    else:
        raise TypeError

def str_to_time(str_time):
    return datetime.datetime.strptime(str_time, "%H:%M:%S").time()

def elapsed_second(str_time1, str_time2):
    sec1 = time_to_second(str_to_time(str_time1))
    sec2 = time_to_second(str_to_time(str_time2))
    return abs(sec2 - sec1)
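
# Worked example (a sketch, not from the original source): the timestamps
# carry no date, so a run that crosses midnight is mis-measured by abs().
#   elapsed_second("01:00:00", "01:00:05")  # -> 5
#   elapsed_second("23:59:00", "00:01:00")  # -> 86280, though only 120 s passed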

def diff_ip(input_file1, input_file2, output_file):
    with open(input_file1) as f1:
        l1 = set(f1)
    with open(input_file2) as f2:
        l2 = set(f2)
    # Lines unique to the first file; if there are none, fall back to the
    # lines unique to the second file.
    result = l1 - l2
    if not result:
        result = l2 - l1
    with open(output_file, "w") as f3:
        for i in result:
            f3.write(i)
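
# Usage sketch, taken from the commented-out example near the bottom of this
# file: write the IP lines found in one list but not the other.
#   diff_ip("C:\\data\\solidip\\notnaver.txt", "C:\\data\\solidip\\naver.txt",
#           "C:\\data\\solidip\\result.txt")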

def get_log_files(dir_path, str_date=None):
    f = []
    for (path, dirs, files) in os.walk(dir_path):
        for filename in files:
            if not str_date:
                m = re_log_file.match(filename)
                if m:
                    f.append(os.path.join(path, filename))
            else:
                if (str_date in filename) and filename.endswith(".log"):
                    f.append(os.path.join(path, filename))
    return f
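
# Example (paths are illustrative): without str_date, only names matching
# "YYYY-MM-DD_<number>.log" are collected; with str_date, any *.log name
# containing the substring qualifies.
#   get_log_files("C:\\data\\crawlerlog\\this")             # date-named logs only
#   get_log_files("C:\\data\\crawlerlog\\this", "2016-05")  # *.log containing "2016-05"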

def analyze_crawler_log_dir(dir_path, str_date=None):
    log_files = get_log_files(dir_path, str_date)
    total_result = {k: [0, 0] for k in ["naverblog", "navercafe", "daumcafe", 'None']}
    for log_file in log_files:
        result = analyze_crawler_log_file(log_file)
        log_type = get_log_type(log_file)
        total_result[log_type][0] += result[0]
        total_result[log_type][1] += result[1]
    return total_result

def analyze_crawler_log_dir_day(dir_path, mon=False):
    log_files = get_log_files(dir_path)
    file_prefix = set()
    for log_file in log_files:
        if mon:
            m = re_log_file_mon.search(log_file)
        else:
            m = re_log_file.search(log_file)
        if m:
            file_prefix.add(m.group(1))
    total_result = {k: {v: [0, 0] for v in ["naverblog", "navercafe", "daumcafe", 'None']} for k in file_prefix}
    for log_file in log_files:
        result = analyze_crawler_log_file(log_file)
        log_type = get_log_type(log_file)
        if mon:
            m = re_log_file_mon.search(log_file)
        else:
            m = re_log_file.search(log_file)
        if m:
            total_result[m.group(1)][log_type][0] += result[0]
            total_result[m.group(1)][log_type][1] += result[1]
    return total_result
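
# Shape of the returned mapping (the numbers are made up for illustration):
# date (or month when mon=True) maps to log type, which maps to
# [elapsed seconds, successful crawls].
#   {'2016-05-25': {'naverblog': [3600, 1200], 'navercafe': [0, 0],
#                   'daumcafe': [0, 0], 'None': [0, 0]}}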

def print_analyzer_dir_day(dir_path, mon=False):
    total_result = analyze_crawler_log_dir_day(dir_path, mon)
    od = collections.OrderedDict(sorted(total_result.items()))
    for k, v in od.items():
        for x, y in v.items():
            if y[0]:
                print(k, ':', end=' ')
                print("{0:<10}".format(str(x)), end=' ')
                print(":", end=' ')
                print("{0:>11}".format(y[0]), end=' ')
                print("seconds,", end=' ')
                print("{0:>7}".format(y[1]), end=' ')
                print("contents,", end=' ')
                if y[1]:
                    print("{0:>7.2f}".format(y[0]/y[1]), end=' ')
                    print("seconds/content")
                else:
                    print()

def print_analyze_dir(dir_path, str_date=None):
    total_result = analyze_crawler_log_dir(dir_path, str_date)
    od = collections.OrderedDict(sorted(total_result.items()))
    for k, v in od.items():
        try:
            if v[0]:
                print("{0:<10}".format(str(k)), end=' ')
                print(":", end=' ')
                print("{0:>11}".format(v[0]), end=' ')
                print("seconds,", end=' ')
                print("{0:>7}".format(v[1]), end=' ')
                print("contents,", end=' ')
                if v[1]:
                    print("{0:>7.2f}".format(v[0]/v[1]), end=' ')
                    print("seconds/content")
                else:
                    print()
        except Exception as e:
            print("error occurred:", e)

def print_analyze_file(file_name):
    total_result = analyze_crawler_log_file(file_name)
    log_type = get_log_type(file_name)
    try:
        print("{0:<10}".format(str(log_type)), end=' ')
        print(":", end=' ')
        print("{0:>11}".format(total_result[0]), end=' ')
        print("seconds,", end=' ')
        print("{0:>7}".format(total_result[1]), end=' ')
        print("contents,", end=' ')
        if total_result[1]:
            print("{0:>7.2f}".format(total_result[0]/total_result[1]), end=' ')
            print("seconds/content")
        else:
            print()
    except Exception as e:
        print("error occurred:", e)
dir_path = "C:\\data\\crawlerlog\\this"
print_analyzer_dir_day(dir_path)
# #str_date = "2016-01"
# #print_analyze_dir(dir_path, str_date)
# print_analyze_dir(dir_path)
# #file_name = "C:\\data\\crawlerlog\\1\\Crawler.01.20\\2016-04-09_27526.log"
# file_name = "E:\\QtLibrary\\mingw5.4.0.32\\2016-05-25_5656.log"
# print_analyze_file(file_name)
# print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_996.log")
# print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_6936.log")
# print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_7048.log")
# print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_9116.log")
#print(analyze_crawler_log_dir(dir_path))
# file1 = "C:\\data\\solidip\\notnaver.txt"
# file2 = "C:\\data\\solidip\\naver.txt"
# file3 = "C:\\data\\solidip\\result.txt"
# diff_ip(file1, file2, file3)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Analyze crawler log files')
    parser.add_argument('-f', action='store', type=str, default=None,
                        help='Analyze a single log file')
    parser.add_argument('-d', action='store', type=str, default=None,
                        help='Analyze all log files in a directory')
    parser.add_argument('-o', action='store', type=str, default=None,
                        help='With -d, analyze only log files whose names contain the given string')
    parser.add_argument('-m', action='store', type=str, default=None,
                        help='Group results by period with -d: "-m m" for monthly, "-m d" for daily. '
                             'Ignores the -o option')
    args = parser.parse_args()
    if args.f:
        print_analyze_file(args.f)
    elif args.d:
        if args.m:
            if args.m == 'm':
                print_analyzer_dir_day(args.d, True)
            elif args.m == 'd':
                print_analyzer_dir_day(args.d, False)
            else:
                parser.print_help()
        else:
            print_analyze_dir(args.d, args.o)
    else:
        parser.print_help()
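
# Example invocations (paths are illustrative):
#   python crawlerstatistic.py -f C:\data\crawlerlog\this\2016-05-25_996.log
#   python crawlerstatistic.py -d C:\data\crawlerlog\this
#   python crawlerstatistic.py -d C:\data\crawlerlog\this -o 2016-05
#   python crawlerstatistic.py -d C:\data\crawlerlog\this -m d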