Commit 128167a3 authored by Jérome Perrin's avatar Jérome Perrin Committed by Vincent Pelletier

Support non escaped referer in log

Unlike apache, which escapes non-ASCII characters in the referrer, caddy
writes the referrer as is. Edge seems to send the referrer unescaped, so with
Edge and caddy we can have non-ASCII text in the referrer.

For lines which cannot be decoded as ASCII, we use python's `replace`
error handler, which allows the line to be processed when
the decoding problem is only about the encoding of the referrer.

We don't implement this case as "skip and report ill-formed line",
because python does not provide utilities to do this easily.

Reproduction with caddy:

```
curl -k http://localhost -H 'Referer: héhé'
```

With apache, `LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"" common`
```
127.0.0.1 - - [28/Feb/2019:10:03:33 +0100] "GET / HTTP/1.1" 200 2046 "h\xc3\xa9h\xc3\xa9" "curl/7.50.1" 4
```

With caddy, `log / stdout "{remote} {>REMOTE_USER} [{when}] \"{method} {uri} {proto}\" {status} {size} \"{>Referer}\" \"{>User-Agent}\" {latency_ms}"`

```
127.0.0.1 - [28/Feb/2019:10:05:00 +0100] "GET / HTTP/2.0" 200 1950 "héhé" "curl/7.50.1" 4
```
parent 48392feb
......@@ -103,7 +103,11 @@ if lzma is not None:
# XXX: which encoding should be used ? apache does not document one, but
# requests are expected to be urlencoded, hence pure ascii. Are timestamps
# localised ?
# Caddy, unlike apache, does not escape referrer headers, so non ascii
# characters may show up in caddy log files.
# Input is decoded as ascii, and every non-ascii character is substituted with
# the unicode replacement character.
INPUT_ENCODING_ERROR_HANDLER = 'replace'
INPUT_ENCODING = 'ascii'
# Abbreviated english month name -> month number, counting from 1.
MONTH_VALUE_DICT = {
  month_name: month_number
  for month_number, month_name in enumerate(
    (
      'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
      'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
    ),
    1,
  )
}
......@@ -1452,7 +1456,7 @@ def main():
logfile = sys.stdin
else:
for opener, exc in FILE_OPENER_LIST:
logfile = opener(filename, _read_mode, encoding=INPUT_ENCODING)
logfile = opener(filename, _read_mode, encoding=INPUT_ENCODING, errors=INPUT_ENCODING_ERROR_HANDLER)
try:
logfile.readline()
except exc:
......@@ -1461,7 +1465,7 @@ def main():
logfile.seek(0)
break
else:
logfile = codecs.open(filename, _read_mode, encoding=INPUT_ENCODING)
logfile = codecs.open(filename, _read_mode, encoding=INPUT_ENCODING, errors=INPUT_ENCODING_ERROR_HANDLER)
lineno = 0
for lineno, line in enumerate(logfile, 1):
if show_progress and lineno % 5000 == 0:
......
import unittest
import sys
import json
from StringIO import StringIO
import tempfile
import apachedex
class MalformedInputTestCase(unittest.TestCase):
class ApacheDEXTestCase(unittest.TestCase):
def setUp(self):
self._original_sys_argv = sys.argv
self._original_sys_stdin = sys.stdin
......@@ -19,6 +21,8 @@ class MalformedInputTestCase(unittest.TestCase):
sys.stderr = self._original_sys_stderr
sys.stdout = self._original_sys_stdout
class TestMalformedInput(ApacheDEXTestCase):
def test_timestamp_mixed_in_timestamp(self):
sys.argv = ['apachedex', '--base=/', '-']
sys.stdin = StringIO(
......@@ -30,3 +34,29 @@ class MalformedInputTestCase(unittest.TestCase):
self.assertNotIn('Malformed line at -:1', sys.stderr.getvalue())
self.assertIn('Malformed line at -:2', sys.stderr.getvalue())
class TestCharacterEncoding(ApacheDEXTestCase):
  """Run apachedex on access log lines whose referer holds non ascii text."""

  def test_apache_referer_encoding(self):
    """An apache-style line must not be reported as malformed."""
    # Apache "backslash escapes" the referer (although the user agent quite
    # often %-encodes the referrer itself, as in this sample line taken from
    # request-caddy-frontend-1/SOFTINST-49218_access_log-20190220 ).
    access_log_line = b'127.0.0.1 -- [19/Feb/2019:17:49:22 +0100] "POST /erp5/budget_module/20181219-2B1DB4A/1/Base_edit HTTP/1.1" 302 194 "https://example.org/erp5/budget_module/20181219-2B1DB4A/1/BudgetLine_viewSpreadsheet?selection_index=0&selection_name=budget_line_list_selection&ignore_layout:int=1&editable_mode=1&portal_status_message=Donn%C3%A9es%20enregistr%C3%A9es." "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36" 2999\n'
    with tempfile.NamedTemporaryFile() as log_file:
      with tempfile.NamedTemporaryFile() as report_file:
        log_file.write(access_log_line)
        log_file.flush()
        sys.argv = [
          'apachedex',
          '--base=/',
          log_file.name,
          '-f',
          'json',
          '-o',
          report_file.name,
        ]
        apachedex.main()
        error_output = sys.stderr.getvalue()
        self.assertNotIn('Malformed line', error_output)
        # The json report must load and be non-empty.
        with open(report_file.name) as report_stream:
          report = json.load(report_stream)
        self.assertTrue(report)

  def test_caddy_referer_encoding(self):
    """A caddy-style line with raw non ascii bytes must yield a report."""
    # Caddy writes the referer "as is". This is an (anonymised) line from
    # request-caddy-frontend-1/SOFTINST-49218_access_log-20190220
    access_log_line = b'127.0.0.1 - - [19/Feb/2019:17:49:22 +0100] "GET / HTTP/1.1" 200 741 "https://example.org/erp5/budget_module/20190219-1F39610/9/BudgetLine_viewSpreadsheet?selection_index=4&selection_name=budget_line_list_selection&ignore_layout:int=1&editable_mode=1&portal_status_message=Donn\xe9es%20enregistr\xe9es." "Mozilla/5.0 (Windows NT 10.0; Win64;x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134" 7'
    with tempfile.NamedTemporaryFile() as log_file:
      with tempfile.NamedTemporaryFile() as report_file:
        log_file.write(access_log_line)
        log_file.flush()
        sys.argv = [
          'apachedex',
          '--base=/',
          log_file.name,
          '-f',
          'json',
          '-o',
          report_file.name,
        ]
        apachedex.main()
        # The json report must load and be non-empty.
        with open(report_file.name) as report_stream:
          report = json.load(report_stream)
        self.assertTrue(report)
......@@ -88,6 +88,7 @@ setup(
package_data={
'apachedex': list(DEPS.keys()) + ['apachedex.js', 'apachedex.css'],
},
test_suite='apachedex.tests',
zip_safe=True,
**extra
)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment