-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_logs.py
78 lines (68 loc) · 2.6 KB
/
parse_logs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import re
import datetime
import numpy
from pyspark.sql.functions import col, stddev, mean, avg, when, log
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark import sql
from pyspark.ml.feature import OneHotEncoder, StringIndexer
import sys
import os
from math import sqrt
from pyspark.mllib.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.mllib.linalg import Vectors
import pyspark.sql.functions as spf
def parse_apache_time(s):
""" Convert Apache time format into a Python datetime object
Args:
s (str): date and time in Apache time format
Returns:
datetime: datetime object (ignore timezone for now)
"""
month_map = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
return datetime.datetime(int(s[7:11]),
month_map[s[3:6]],
int(s[0:2]),
int(s[12:14]),
int(s[15:17]),
int(s[18:20]))
def parseApacheLogLine(logline):
""" Parse a line in the Apache Common Log format
Args:
logline (str): a line of text in the Apache Common Log format
Returns:
tuple: either a dictionary containing the parts of the Apache Access Log and 1,
or the original invalid log line and 0
"""
# A regular expression pattern to extract fields from the log line
APACHE_ACCESS_LOG_PATTERN = '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\d{4}) "-" "(\S+) \((.*)"'
match = re.search(APACHE_ACCESS_LOG_PATTERN, logline)
if match is None:
return (logline, 0)
size_field = match.group(9)
if size_field == '-':
size = long(0)
else:
size = long(match.group(9))
return (Row(
host=match.group(1),
date_time=parse_apache_time(match.group(4)),
browser=match.group(10),
ua=len(match.group(11))
), 1)
def parseLogs(sc, logFile):
""" Read and parse log file """
parsed_logs = (sc
.textFile(logFile)
.map(parseApacheLogLine)
.cache())
access_logs = (parsed_logs
.filter(lambda s: s[1] == 1)
.map(lambda s: s[0])
.cache())
print 'Read %d lines, successfully parsed %d lines' % (parsed_logs.count(), access_logs.count())
return parsed_logs, access_logs