Commit 128167a3 authored by Jérome Perrin, committed by Vincent Pelletier

Support non escaped referer in log

Unlike Apache, which escapes non-ASCII characters in the referrer, Caddy
writes the referrer as is. Edge seems to send the referrer unescaped, so
with Edge and Caddy we can have non-ASCII text in the referrer.

For lines which cannot be decoded as ASCII, we use Python's `replace`
error handler. This allows the line to be processed when the only
decoding problem is the encoding of the referrer.

We don't handle this case as "skip and report ill-formed line",
because Python does not provide utilities to do this easily.

Reproduction with caddy:

```
curl -k http://localhost -H 'Referer: héhé'
```

With apache, `LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"" common`
```
127.0.0.1 - - [28/Feb/2019:10:03:33 +0100] "GET / HTTP/1.1" 200 2046 "h\xc3\xa9h\xc3\xa9" "curl/7.50.1" 4
```

With caddy, `log / stdout "{remote} {>REMOTE_USER} [{when}] \"{method} {uri} {proto}\" {status} {size} \"{>Referer}\" \"{>User-Agent}\" {latency_ms}"`

```
127.0.0.1 - [28/Feb/2019:10:05:00 +0100] "GET / HTTP/2.0" 200 1950 "héhé" "curl/7.50.1" 4
```
parent 48392feb
...@@ -103,7 +103,11 @@ if lzma is not None: ...@@ -103,7 +103,11 @@ if lzma is not None:
# XXX: what encoding ? apache doesn't document one, but requests are supposed # XXX: what encoding ? apache doesn't document one, but requests are supposed
# to be urlencoded, so pure ascii. Are timestamps localised ? # to be urlencoded, so pure ascii. Are timestamps localised ?
# Unlike apache, Caddy does not escape referrer headers, so caddy log files may contain
# non ascii characters.
# We read them as ascii, replacing non-ascii characters by unicode replacement character.
INPUT_ENCODING = 'ascii' INPUT_ENCODING = 'ascii'
INPUT_ENCODING_ERROR_HANDLER = 'replace'
MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar', MONTH_VALUE_DICT = dict((y, x) for (x, y) in enumerate(('Jan', 'Feb', 'Mar',
'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1)) 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), 1))
...@@ -1452,7 +1456,7 @@ def main(): ...@@ -1452,7 +1456,7 @@ def main():
logfile = sys.stdin logfile = sys.stdin
else: else:
for opener, exc in FILE_OPENER_LIST: for opener, exc in FILE_OPENER_LIST:
logfile = opener(filename, _read_mode, encoding=INPUT_ENCODING) logfile = opener(filename, _read_mode, encoding=INPUT_ENCODING, errors=INPUT_ENCODING_ERROR_HANDLER)
try: try:
logfile.readline() logfile.readline()
except exc: except exc:
...@@ -1461,7 +1465,7 @@ def main(): ...@@ -1461,7 +1465,7 @@ def main():
logfile.seek(0) logfile.seek(0)
break break
else: else:
logfile = codecs.open(filename, _read_mode, encoding=INPUT_ENCODING) logfile = codecs.open(filename, _read_mode, encoding=INPUT_ENCODING, errors=INPUT_ENCODING_ERROR_HANDLER)
lineno = 0 lineno = 0
for lineno, line in enumerate(logfile, 1): for lineno, line in enumerate(logfile, 1):
if show_progress and lineno % 5000 == 0: if show_progress and lineno % 5000 == 0:
......
import unittest import unittest
import sys import sys
import json
from StringIO import StringIO from StringIO import StringIO
import tempfile
import apachedex import apachedex
class MalformedInputTestCase(unittest.TestCase): class ApacheDEXTestCase(unittest.TestCase):
def setUp(self): def setUp(self):
self._original_sys_argv = sys.argv self._original_sys_argv = sys.argv
self._original_sys_stdin = sys.stdin self._original_sys_stdin = sys.stdin
...@@ -19,6 +21,8 @@ class MalformedInputTestCase(unittest.TestCase): ...@@ -19,6 +21,8 @@ class MalformedInputTestCase(unittest.TestCase):
sys.stderr = self._original_sys_stderr sys.stderr = self._original_sys_stderr
sys.stdout = self._original_sys_stdout sys.stdout = self._original_sys_stdout
class TestMalformedInput(ApacheDEXTestCase):
def test_timestamp_mixed_in_timestamp(self): def test_timestamp_mixed_in_timestamp(self):
sys.argv = ['apachedex', '--base=/', '-'] sys.argv = ['apachedex', '--base=/', '-']
sys.stdin = StringIO( sys.stdin = StringIO(
...@@ -30,3 +34,29 @@ class MalformedInputTestCase(unittest.TestCase): ...@@ -30,3 +34,29 @@ class MalformedInputTestCase(unittest.TestCase):
self.assertNotIn('Malformed line at -:1', sys.stderr.getvalue()) self.assertNotIn('Malformed line at -:1', sys.stderr.getvalue())
self.assertIn('Malformed line at -:2', sys.stderr.getvalue()) self.assertIn('Malformed line at -:2', sys.stderr.getvalue())
class TestCharacterEncoding(ApacheDEXTestCase):
def test_apache_referer_encoding(self):
with tempfile.NamedTemporaryFile() as fin, tempfile.NamedTemporaryFile() as fout:
# with apache, referer is "backslash escaped" (but quite often, referrer is %-encoded by user agent, like on
# this example line taken from request-caddy-frontend-1/SOFTINST-49218_access_log-20190220 )
fin.write(
b'127.0.0.1 -- [19/Feb/2019:17:49:22 +0100] "POST /erp5/budget_module/20181219-2B1DB4A/1/Base_edit HTTP/1.1" 302 194 "https://example.org/erp5/budget_module/20181219-2B1DB4A/1/BudgetLine_viewSpreadsheet?selection_index=0&selection_name=budget_line_list_selection&ignore_layout:int=1&editable_mode=1&portal_status_message=Donn%C3%A9es%20enregistr%C3%A9es." "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36" 2999\n')
fin.flush()
sys.argv = ['apachedex', '--base=/', fin.name, '-f', 'json', '-o', fout.name]
apachedex.main()
self.assertNotIn('Malformed line', sys.stderr.getvalue())
with open(fout.name) as f:
self.assertTrue(json.load(f))
def test_caddy_referer_encoding(self):
with tempfile.NamedTemporaryFile() as fin, tempfile.NamedTemporaryFile() as fout:
# with caddy, referer is written "as is"
fin.write(
# this is an (anonymised) line from request-caddy-frontend-1/SOFTINST-49218_access_log-20190220
b'127.0.0.1 - - [19/Feb/2019:17:49:22 +0100] "GET / HTTP/1.1" 200 741 "https://example.org/erp5/budget_module/20190219-1F39610/9/BudgetLine_viewSpreadsheet?selection_index=4&selection_name=budget_line_list_selection&ignore_layout:int=1&editable_mode=1&portal_status_message=Donn\xe9es%20enregistr\xe9es." "Mozilla/5.0 (Windows NT 10.0; Win64;x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134" 7')
fin.flush()
sys.argv = ['apachedex', '--base=/', fin.name, '-f', 'json', '-o', fout.name]
apachedex.main()
with open(fout.name) as f:
self.assertTrue(json.load(f))
...@@ -88,6 +88,7 @@ setup( ...@@ -88,6 +88,7 @@ setup(
package_data={ package_data={
'apachedex': list(DEPS.keys()) + ['apachedex.js', 'apachedex.css'], 'apachedex': list(DEPS.keys()) + ['apachedex.js', 'apachedex.css'],
}, },
test_suite='apachedex.tests',
zip_safe=True, zip_safe=True,
**extra **extra
) )
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment