# Copyright © 2020 Interplanetary Database Association e.V.,
# BigchainDB and IPDB software contributors.
# SPDX-License-Identifier: (Apache-2.0 AND CC-BY-4.0)
# Code is Apache-2.0 and docs are CC-BY-4.0
"""
A little Python script to do some analysis of the NGINX logs.
To get the relevant NGINX logs:
1. Go to the OMS Portal
2. Create a new Log Search
3. Use a search string such as:
   Type=ContainerLog Image="bigchaindb/nginx_3scale:1.3" GET NOT("Go-http-client") NOT(runscope)
   (This gets all logs from the NGINX container, but only those containing "GET",
   excluding those containing "Go-http-client" [internal Kubernetes traffic]
   and those containing "runscope" [Runscope tests].)
4. In the left sidebar, at the top, use the dropdown menu to select the time range,
   e.g. "Data based on last 7 days". Pay attention to the number of results and
   the time series chart in the left sidebar. Are there any spikes?
5. Export the search results. A CSV file will be saved on your local machine.
6. $ python3 analyze.py logs.csv
Thanks to https://gist.github.com/hreeder/f1ffe1408d296ce0591d
"""
import sys
import csv
import re
from dateutil.parser import parse
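# Note: dateutil (imported above) comes from the third-party "python-dateutil"
# package (pip install python-dateutil); the other imports are standard library.
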
lineformat = re.compile(r'(?P<ipaddress>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - '
                        r'\[(?P<dateandtime>\d{2}\/[a-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} '
                        r'(\+|\-)\d{4})\] ((\"(GET|POST) )(?P<url>.+)(http\/1\.1")) '
                        r'(?P<statuscode>\d{3}) '
                        r'(?P<bytessent>\d+) '
                        r'(["](?P<referrer>(\-)|(.+))["]) '
                        r'(["](?P<useragent>.+)["])',
                        re.IGNORECASE)
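
# For reference, a hypothetical NGINX access-log line in the format the regex
# above expects (illustrative values only, not taken from real logs):
#   93.184.216.34 - - [06/Apr/2020:11:52:18 +0200] "GET /api/v1/ HTTP/1.1" 200 1234 "-" "curl/7.58.0"
# The named groups would capture the IP address, timestamp, URL path,
# status code, bytes sent, referrer and user agent.
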
filepath = sys.argv[1]
logline_list = []
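
# Assumption about the OMS CSV export: the raw NGINX log line sits in the ninth
# column (index 8), and the first row is a header whose cell there reads
# 'LogEntry'; that header row is skipped below.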
with open(filepath) as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    for row in csvreader:
        if row and (row[8] != 'LogEntry'):
            # because the first line is just the column headers, such as 'LogEntry'
            logline = row[8]
            print(logline + '\n')
            logline_data = re.search(lineformat, logline)
            if logline_data:
                logline_dict = logline_data.groupdict()
                logline_list.append(logline_dict)
                # so logline_list is a list of dicts
                # print('{}'.format(logline_dict))
# Analysis
total_bytes_sent = 0
tstamp_list = []
for lldict in logline_list:
    total_bytes_sent += int(lldict['bytessent'])
    dt = lldict['dateandtime']
    # dt looks like e.g. '06/Apr/2020:11:52:18 +0200'; replace the colon between
    # the date and the time with a space so dateutil can parse it.
    # https://tinyurl.com/lqjnhot
    dtime = parse(dt[:11] + " " + dt[12:])
    tstamp_list.append(dtime.timestamp())
print('Number of log lines seen: {}'.format(len(logline_list)))
# Time range
trange_sec = max(tstamp_list) - min(tstamp_list)
trange_days = trange_sec / 60.0 / 60.0 / 24.0
print('Time range seen (days): {}'.format(trange_days))
print('Total bytes sent: {}'.format(total_bytes_sent))
print('Average bytes sent per day (out via GET): {}'.
      format(total_bytes_sent / trange_days))
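
# Illustrative summary output (values are made up, shown for format only; it is
# preceded by each matched raw log line, which is printed as it is read):
#   Number of log lines seen: 1024
#   Time range seen (days): 6.9
#   Total bytes sent: 12345678
#   Average bytes sent per day (out via GET): 1789228.7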