# Crawler log analysis utilities: classify crawler log files by service,
# compute elapsed time and success counts, and print per-day / per-type
# summaries from the command line.
import re
|
|
import datetime
|
|
import os
|
|
import argparse
|
|
import collections
|
|
|
|
# re_ok = re.compile('^\[([\d]{2}:[\d]{2}:[\d]{2})\] o ')
|
|
re_ok = re.compile('^\[([\d]{2}:[\d]{2}:[\d]{2})\] ok')
|
|
re_day = re.compile('^\[([\d]{2}:[\d]{2}:[\d]{2})\]')
|
|
str_naver_blog = "where=post"
|
|
str_naver_cafe = "where=article"
|
|
str_daum_cafe = "search.daum.net"
|
|
str_insta_post = "www.instagram.com/"
|
|
str_kakao_story = "story.kakao.com/"
|
|
str_naver_news = "news.naver.com/"
|
|
re_log_file = re.compile("([\d]{4}-[\d]{2}-[\d]{2})_[\d]+\\.log$")
|
|
re_log_file_mon = re.compile("([\d]{4}-[\d]{2})-[\d]{2}_[\d]+\\.log$")
|
|
log_type_list = ["naverblog", "navercafe", "daumcafe", 'instagram', 'kakaostory', 'navernews', 'None']
|
|
|
|
|
|
def get_log_type(log_file_name):
|
|
with open(log_file_name) as f:
|
|
for line in f:
|
|
if line.find(str_naver_blog) != -1:
|
|
return "naverblog"
|
|
elif line.find(str_naver_cafe) != -1:
|
|
return "navercafe"
|
|
elif line.find(str_daum_cafe) != -1:
|
|
return "daumcafe"
|
|
elif line.find(str_insta_post) != -1:
|
|
return "instagram"
|
|
elif line.find(str_kakao_story) != -1:
|
|
return "kakaostory"
|
|
elif line.find(str_naver_news) != -1:
|
|
return "navernews"
|
|
return 'None'
|
|
|
|
|
|
def crawler_log_analysis(file_log_name):
|
|
match_count = 0
|
|
b_first = True
|
|
start_time = ""
|
|
end_time = ""
|
|
with open(file_log_name) as f:
|
|
for line in f:
|
|
m = re_day.search(line)
|
|
if m:
|
|
if b_first:
|
|
b_first = False
|
|
start_time = m.group(1)
|
|
end_time = m.group(1)
|
|
m = re_ok.search(line)
|
|
if m:
|
|
match_count += 1
|
|
print("start time = " + start_time)
|
|
print("end time = " + end_time)
|
|
print("# of success crawler = " + str(match_count))
|
|
|
|
|
|
def anlayze_crawler_log_file(log_file_name):
|
|
match_count = 0
|
|
b_first = True
|
|
start_time = ""
|
|
end_time = ""
|
|
with open(log_file_name) as f:
|
|
try:
|
|
for line in f:
|
|
m = re_day.search(line)
|
|
if m:
|
|
if b_first:
|
|
b_first = False
|
|
start_time = m.group(1)
|
|
end_time = m.group(1)
|
|
m = re_ok.search(line)
|
|
if m:
|
|
match_count += 1
|
|
except Exception as e:
|
|
print("filename =", log_file_name)
|
|
print("error occur: ", e)
|
|
elap_sec = elapsed_second(start_time, end_time)
|
|
return elap_sec, match_count
|
|
|
|
|
|
def time_to_second(val_time):
|
|
if type(val_time) == datetime.time:
|
|
return val_time.hour * 3600 + val_time.minute * 60 + val_time.second
|
|
else:
|
|
raise TypeError
|
|
|
|
|
|
def str_to_time(str_time):
|
|
return datetime.datetime.strptime(str_time, "%H:%M:%S").time()
|
|
|
|
|
|
def elapsed_second(str_time1, str_time2):
|
|
sec1 = time_to_second(str_to_time(str_time1))
|
|
sec2 = time_to_second(str_to_time(str_time2))
|
|
result = sec2 - sec1
|
|
if result < 0:
|
|
return -result
|
|
else:
|
|
return result
|
|
|
|
|
|
def diff_ip(input_file1, input_file2, output_file):
|
|
f1 = open(input_file1)
|
|
f2 = open(input_file2)
|
|
f3 = open(output_file, "w")
|
|
l1 = set()
|
|
l2 = set()
|
|
for line in f1:
|
|
l1.add(line)
|
|
for line in f2:
|
|
l2.add(line)
|
|
f1.close()
|
|
f2.close()
|
|
result = l1 - l2
|
|
if not result:
|
|
result = l2 - l1
|
|
for i in result:
|
|
f3.write(i)
|
|
f3.close()
|
|
|
|
|
|
def get_log_files(dir_path, str_date=None):
|
|
f = []
|
|
for (path, dir, files) in os.walk(dir_path):
|
|
for filename in files:
|
|
if not str_date:
|
|
m = re_log_file.match(filename)
|
|
if m:
|
|
f.append(os.path.join(path, filename))
|
|
else:
|
|
if (str_date in filename) and (filename[-4:] == ".log"):
|
|
f.append(os.path.join(path, filename))
|
|
return f
|
|
|
|
|
|
def analyze_crawler_log_dir(dir_path, str_date=None):
|
|
log_files = get_log_files(dir_path, str_date)
|
|
total_result = {k: [0, 0] for k in log_type_list}
|
|
for log_file in log_files:
|
|
result = anlayze_crawler_log_file(log_file)
|
|
log_type = get_log_type(log_file)
|
|
total_result[log_type][0] += result[0]
|
|
total_result[log_type][1] += result[1]
|
|
return total_result
|
|
|
|
|
|
def analyze_crawler_log_dir_day(dir_path, mon=False):
|
|
log_files = get_log_files(dir_path)
|
|
file_prefix = set()
|
|
for log_file in log_files:
|
|
if mon:
|
|
m = re_log_file_mon.search(log_file)
|
|
else:
|
|
m = re_log_file.search(log_file)
|
|
if m:
|
|
file_prefix.add(m.group(1))
|
|
total_result = {k: {v: [0, 0] for v in log_type_list} for k in file_prefix}
|
|
for log_file in log_files:
|
|
result = anlayze_crawler_log_file(log_file)
|
|
log_type = get_log_type(log_file)
|
|
if mon:
|
|
m = re_log_file_mon.search(log_file)
|
|
else:
|
|
m = re_log_file.search(log_file)
|
|
if m:
|
|
total_result[m.group(1)][log_type][0] += result[0]
|
|
total_result[m.group(1)][log_type][1] += result[1]
|
|
return total_result
|
|
|
|
|
|
def print_analyzer_dir_day(dir_path, mon=False):
|
|
total_result = analyze_crawler_log_dir_day(dir_path, mon)
|
|
od = collections.OrderedDict(sorted(total_result.items()))
|
|
for k, v in od.items():
|
|
for x, y in v.items():
|
|
if y[0]:
|
|
print(k, ':', end=' ')
|
|
print("{0:<10}".format(str(x)), end=' ')
|
|
print(":", end=' ')
|
|
print("{0:>11}".format(y[0]), end=' ')
|
|
print("seconds,", end=' ')
|
|
print("{0:>7}".format(y[1]), end=' ')
|
|
print("contents,", end=' ')
|
|
if y[1]:
|
|
print("{0:>7.2f}".format(y[0]/y[1]), end=' ')
|
|
print("seconds/content")
|
|
else:
|
|
print()
|
|
|
|
|
|
def print_analyze_dir(dir_path, str_date=None):
|
|
total_result = analyze_crawler_log_dir(dir_path, str_date)
|
|
od = collections.OrderedDict(sorted(total_result.items()))
|
|
for k, v in od.items():
|
|
try:
|
|
if v[0]:
|
|
print("{0:<10}".format(str(k)), end=' ')
|
|
print(":", end=' ')
|
|
print("{0:>11}".format(v[0]), end=' ')
|
|
print("seconds,", end=' ')
|
|
print("{0:>7}".format(v[1]), end=' ')
|
|
print("contents,", end=' ')
|
|
if v[1]:
|
|
print("{0:>7.2f}".format(v[0]/v[1]), end=' ')
|
|
print("seconds/content")
|
|
else:
|
|
print()
|
|
except Exception as e:
|
|
print("error occur :", e)
|
|
|
|
|
|
def print_analyze_file(file_name):
|
|
total_result = anlayze_crawler_log_file(file_name)
|
|
log_type = get_log_type(file_name)
|
|
try:
|
|
print("{0:<10}".format(str(log_type)), end=' ')
|
|
print(":", end=' ')
|
|
print("{0:>11}".format(total_result[0]), end=' ')
|
|
print("seconds,", end=' ')
|
|
print("{0:>7}".format(total_result[1]), end=' ')
|
|
print("contents,", end=' ')
|
|
if total_result[1]:
|
|
print("{0:>7.2f}".format(total_result[0]/total_result[1]), end=' ')
|
|
print("seconds/content")
|
|
else:
|
|
print()
|
|
except Exception as e:
|
|
print("error occur :", e)
|
|
|
|
|
|
|
|
dir_path = "C:\\data\\crawlerlog\\this"
|
|
print_analyzer_dir_day(dir_path)
|
|
# #str_date = "2016-01"
|
|
# #print_analyze_dir(dir_path, str_date)
|
|
# print_analyze_dir(dir_path)
|
|
# #file_name = "C:\\data\\crawlerlog\\1\\Crawler.01.20\\2016-04-09_27526.log"
|
|
# file_name = "E:\\QtLibrary\\mingw5.4.0.32\\2016-05-25_5656.log"
|
|
# print_analyze_file(file_name)
|
|
# print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_996.log")
|
|
# print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_6936.log")
|
|
# print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_7048.log")
|
|
# print_analyze_file("C:\\data\\crawlerlog\\this\\2016-05-25_9116.log")
|
|
|
|
#print(analyze_crawler_log_dir(dir_path))
|
|
# file1 = "C:\\data\\solidip\\notnaver.txt"
|
|
# file2 = "C:\\data\\solidip\\naver.txt"
|
|
# file3 = "C:\\data\\solidip\\result.txt"
|
|
# diff_ip(file1, file2, file3)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
parser = argparse.ArgumentParser(description='Analyze crawler log file')
|
|
parser.add_argument('-f', action='store', type=str, default=None, help='Analyze a log file')
|
|
parser.add_argument('-d', action='store', type=str, default=None, help='Analyze all log files in directory')
|
|
parser.add_argument('-o', action='store', type=str, default=None, help='Analyze log files containing specified'
|
|
'a string with -d option')
|
|
parser.add_argument('-m', action='store', type=str, default=None, help='Analyze log files monthly or daily. '
|
|
'-m m : monthly, -m d : daily. '
|
|
'This ignore -o option')
|
|
args = parser.parse_args()
|
|
if args.f:
|
|
print_analyze_file(args.f)
|
|
elif args.d:
|
|
if args.m:
|
|
if args.m == 'm':
|
|
print_analyzer_dir_day(args.d, True)
|
|
elif args.d == 'd':
|
|
print_analyzer_dir_day(args.d, False)
|
|
else:
|
|
parser.print_help()
|
|
else:
|
|
print_analyze_dir(args.d, args.o)
|
|
else:
|
|
parser.print_help()
|
|
|