#!/usr/bin/python3

# Readme
# ======
# (parts copy/pasted from redd.it/fjgit)
# It works like this:
#
#     $ tgrep 8:42:04
#     [log lines with that precise timestamp]
#     $ tgrep 10:01
#     [log lines with timestamps between 10:01:00 and 10:01:59]
#     $ tgrep 23:59-0:03
#     [log lines between 23:59:00 and 0:03:59]
#
# By default it uses /log/haproxy.log as the input file, but you can specify
# an alternate filename by appending it to the command line. It also works if
# you prepend it, because who has time to remember the order of arguments for
# every little dumb script?
#
# Most importantly, tgrep is fast, because it doesn't look at every line in
# the file. It jumps around, checking timestamps and doing a binary search
# until it finds the range you're looking for.
#
# This program assumes all lines in the log file will match the following
# pattern, which is not an unreasonable assumption, but expect an ugly error
# message if a line doesn't match.
# Log pattern: \w+\s+\d+\s+\d+:\d+:\d+

# Out of Order Time Stamps
# ========================
# In the 'real world' the time stamps aren't always going to increase, but
# the log will be in mostly sorted order. max_displacement is included to
# widen the range that is read by a fixed number of bytes before and after
# the matching range, which enables finding results that are out of order.
# The actual value of the parameter would need to be tuned based on real
# world logs to make sure no data is missed without making performance worse.

# Special Cases
# =============
# The log file spans at most two days (crosses midnight at most once), so it
# can be broken into two parts: the first day, and possibly a second.
#
# Start time and end time are provided on the command line (end time defaults
# to the start time). If the end time is earlier than the start time, the
# span crosses midnight, in which case the program checks the time ranges
# (start, midnight) and (midnight, end) for each day. Otherwise each day is
# just checked for entries in the range (start, end).
# # If the time range matches on both days, the results from both days will be # shown (maintaining the order in the log) # Performance # =========== # $ tgrep 6:54:03-6:54:05 > output.txt # (max_displacement = 0) # Input: 72.9 GB file (2^14 entries per second over 6:52 - 7:13 the next day) # Output: 5.7 MB file, 98304 entries # Made 3548 system calls, ran in 0.8 seconds (7200rpm SATA300 WDC drive) # The max resident size in memory was under 27 MB. # Setting max_displacement to 1 MB increases the run time to 1.3 seconds # Setting max_displacement to 6 MB increases the run time to 4 seconds # # $ tgrep 8:23:54 > output.txt # took 0.25 seconds and produced one-sixth the output, signaling that the # amount of data being written out is likely the bottle neck. # Setting max_displacement to 1 MB increases the run time to 0.55 seconds # Setting max_displacement to 6 MB increases the run time to 2.6 seconds # # It could be made a bit faster by not using regex's and datetime.time if one # second is just too long to wait. # # Big-O average is O(lg(n)) with respect to the size of the file, but the # pathological case where the last line is more than half the file would make # it O(n). import re import io import sys from datetime import time # Default logs, open the first of these that exists if a file isn't given default_logs = ['/log/haproxy.log'] # Out of order parameter, number of extra byte to check before and after the # actual found range max_displacement = 1 * (1024**2) # Regex's to match beginning of lines in the file and command line arguments log_pattern = re.compile(r"(?P(?P\w+)\s+(?P\d+))\s+(?P