Commit 4360dbc6 authored by Levin Zimmermann

restricted: Allow patched pandas.read_* functions

Rationale:

Efficiently converting * to a data frame / numpy array is required in all
Wendelin projects; without this functionality Wendelin is useless.
Currently all projects allow this functionality in an insecure way.
This commit aims to improve the situation by supporting a secure way of
providing this functionality.

(See wendelin!99 (comment 158474))

Because pandas (in restricted Python) can also be useful in 'pure' ERP5
(without Wendelin) the functionality is added to ERP5 source code.

---

Security:

Security is guaranteed by patching selected read_* functions and
allowing the patched versions. The patch prohibits anything but
string input which directly contains the data (e.g. no urls, file
paths). New unit tests ensure the restrictions of the patches
are actually effective.

---

Notes on implementation decisions:

Instead of offering new ERP5 extension methods (e.g. Base_readJson)
this commit adds patched pandas read functions in restricted Python.
In this way the change of the known API is as minimal as possible.

Instead of globally monkey-patching pandas read_* functions, only the
functions inside restricted python are patched.
In this way the fully-functional, original functions are still available
in Zope products or ERP5 extension code.

Minor changes to the way pandas is allowed in restricted Python
have been applied. Please consult the following discussions in the merge
request for details:

!1615 (comment 159203)
!1615 (comment 159341)
parent 70b92437
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
# #
############################################################################## ##############################################################################
import json
import os.path import os.path
import tempfile import tempfile
import textwrap import textwrap
...@@ -572,14 +573,6 @@ class TestRestrictedPythonSecurity(ERP5TypeTestCase): ...@@ -572,14 +573,6 @@ class TestRestrictedPythonSecurity(ERP5TypeTestCase):
) )
def testPandasIORead(self): def testPandasIORead(self):
self.assertRaises(
Unauthorized,
self.createAndRunScript,
'''
import pandas as pd
pd.read_csv('testPandasIORead.csv')
''')
# Test the black_list configuration validity # Test the black_list configuration validity
for read_method in pandas_black_list: for read_method in pandas_black_list:
self.assertRaises( self.assertRaises(
...@@ -635,6 +628,148 @@ class TestRestrictedPythonSecurity(ERP5TypeTestCase): ...@@ -635,6 +628,148 @@ class TestRestrictedPythonSecurity(ERP5TypeTestCase):
write_method('testPandasSeriesIOWrite.data') write_method('testPandasSeriesIOWrite.data')
'''.format(write_method=write_method)) '''.format(write_method=write_method))
def _assertPandasRestrictedReadFunctionIsEqualTo(
  self, read_function, read_argument, expected_data_frame_init
):
  """
  Run a script calling pd.<read_function>(<read_argument>) and check that
  the result equals pd.DataFrame(<expected_data_frame_init>).

  All three arguments are source code fragments: they are interpolated
  verbatim into the text of the script handed to createAndRunScript,
  which is expected to return True.
  """
  script_template = '''
import pandas as pd
expected_data_frame = pd.DataFrame({expected_data_frame_init})
return pd.{read_function}({read_argument}).equals(expected_data_frame)
'''
  script = script_template.format(
    read_function=read_function,
    read_argument=read_argument,
    expected_data_frame_init=expected_data_frame_init,
  )
  self.createAndRunScript(script, expected=True)
def testPandasRestrictedReadFunctionProhibitedInput(self):
  """
  Test if patched pandas read_* functions raise with any input which isn't a string.
  """
  script_template = '''
import pandas as pd
{preparation}
pd.{pandas_read_function}({prohibited_input})
'''
  # Each case is (statement run before the call, expression passed to
  # the read function); both are interpolated into the script text.
  prohibited_case_tuple = (
    ('', 100),
    ('from StringIO import StringIO', 'StringIO("[1, 2, 3]")'),
  )
  for pandas_read_function in ("read_json", "read_csv", "read_fwf"):
    for preparation, prohibited_input in prohibited_case_tuple:
      script = script_template.format(
        preparation=preparation,
        pandas_read_function=pandas_read_function,
        prohibited_input=prohibited_input,
      )
      self.assertRaises(
        ZopeGuardsUnauthorized, self.createAndRunScript, script
      )
def testPandasReadFwf(self):
  """
  Check the restricted read_fwf against a list of
  (read argument, expected data frame initialiser) cases.
  """
  case_tuple = (
    # Normal input should be correctly handled
    (r'"100\n200"', r"[[200]], columns=['100']"),
    # Ensure monkey patch passes keyword arguments to patched function
    (r'"1020\n3040", widths=[2, 2]', r"[[30, 40]], columns=['10', '20']"),
    # A string containing an url or file path should be handled as if
    # it would be a normal fixed-width string entry
    (
      r'"file://path/to/fwf/file.fwf"',
      r"[], columns=['file://path/to/fwf/file.fwf']",
    ),
  )
  for read_argument, expected_data_frame_init in case_tuple:
    self._assertPandasRestrictedReadFunctionIsEqualTo(
      "read_fwf", read_argument, expected_data_frame_init
    )
def testPandasReadCSV(self):
  """
  Check the restricted read_csv against a list of
  (read argument, expected data frame initialiser) cases.
  """
  case_tuple = (
    # Normal input should be correctly handled
    (
      r'"11,2,300\n50.5,99,hello"',
      r"[[50.5, 99, 'hello']], columns='11 2 300'.split(' ')",
    ),
    # Ensure monkey patch passes keyword arguments to patched function
    (r'"a;b", sep=";"', r"[], columns=['a', 'b']"),
    # A string containing an url or file path should be handled as if
    # it would be a normal csv string entry
    (
      r'"https://people.sc.fsu.edu/~jburkardt/data/csv/addresses.csv"',
      r"[], columns=['https://people.sc.fsu.edu/~jburkardt/data/csv/addresses.csv']",
    ),
    (
      r'"file://path/to/csv/file.csv"',
      r"[], columns=['file://path/to/csv/file.csv']",
    ),
  )
  for read_argument, expected_data_frame_init in case_tuple:
    self._assertPandasRestrictedReadFunctionIsEqualTo(
      "read_csv", read_argument, expected_data_frame_init
    )
def testPandasReadJsonParsesInput(self):
  """
  Check the restricted read_json against a list of
  (read argument, expected data frame initialiser) cases.
  """
  case_tuple = (
    # Normal input should be correctly handled
    ('"[1, 2, 3]"', "[1, 2, 3]"),
    (
      '\'{"column_name": [1, 2, 3], "another_column": [3, 9.2, 100]}\'',
      '{"column_name": [1, 2, 3], "another_column": [3, 9.2, 100]}',
    ),
    # Ensure monkey patch passes keyword arguments to patched function
    (
      r'"[1, 2, 3]\n[4, 5, 6]", lines=True',
      "[[1, 2, 3], [4, 5, 6]]",
    ),
  )
  for read_argument, expected_data_frame_init in case_tuple:
    self._assertPandasRestrictedReadFunctionIsEqualTo(
      "read_json", read_argument, expected_data_frame_init
    )
  # URLs, etc. should raise a ValueError
  # (see testPandasReadJsonProhibitsMalicousString)
def testPandasReadJsonProhibitsMalicousString(self):
  """
  Test if file paths, URLs and other bad strings
  raise value errors
  """
  # Create a valid json file which could be read
  # by a non-patched read_json function.
  test_file_path = ".testPandasReadJson.json"
  json_test_data = [1, 2, 3]
  with open(test_file_path, 'w') as json_file:
    # Register the removal as soon as the file exists, so that it is
    # deleted even if writing it or a later assertion fails.
    self.addCleanup(os.remove, test_file_path)
    json.dump(json_test_data, json_file)
  # Ensure json creation was successful
  self.assertTrue(os.path.isfile(test_file_path))
  with open(test_file_path, "r") as json_file:
    self.assertEqual(json_test_data, json.load(json_file))

  malicious_input_tuple = (
    # If pandas would read this as an URL it should
    # raise an URLError. But because it will try
    # to read it as a json string, it will raise
    # a ValueError.
    "https://test-url.com/test-name.json",
    "file://path/to/json/file.json",
    # This shouldn't raise any error in case
    # pandas read function wouldn't be patched.
    test_file_path,
    # Gibberish should also raise a ValueError
    "Invalid-string",
  )
  for malicious_input in malicious_input_tuple:
    self.assertRaises(
      ValueError,
      self.createAndRunScript,
      '''
import pandas as pd
pd.read_json("{}")
'''.format(malicious_input)
    )
def test_suite(): def test_suite():
suite = unittest.TestSuite() suite = unittest.TestSuite()
......
##############################################################################
#
# Copyright (c) 2012 Nexedi SARL and Contributors. All Rights Reserved.
# Levin Zimmermann <levin.zimmermann@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
"""
Restricted pandas module.
From restricted python, use "import pandas" (see patches/Restricted.py).
"""
from pandas import *
# Add restricted versions of IO functions
import six as _six
from AccessControl.ZopeGuards import Unauthorized as _ZopeGuardsUnauthorized
if _six.PY2:
from StringIO import StringIO as _StringIO
else:
from io import StringIO as _StringIO
def _addRestrictedPandasReadFunction(function_name):
  """
  Bind a string-only wrapper of pandas.<function_name> in this module.

  The wrapper is stored in this module's globals under function_name,
  replacing the original function bound by "from pandas import *".

  function_name -- name of the pandas read function to wrap,
                   e.g. "read_csv".
  """
  original_function = getattr(__import__('pandas'), function_name)

  def Pandas_read(data_string, *args, **kwargs):
    # Strict: don't use 'isinstance', only allow builtin str
    # objects (subclasses of str are rejected as well).
    if type(data_string) is not str:
      raise _ZopeGuardsUnauthorized(
        "Parsing object '%s' of type '%s' is prohibited!" % (data_string, type(data_string))
      )
    # The original function only ever receives an in-memory buffer that
    # holds the data itself, never the caller's bare string.
    string_io = _StringIO(data_string)
    return original_function(string_io, *args, **kwargs)

  disclaimer = """\n
Disclaimer:
This function has been patched by ERP5 for zope sandbox usage.
Only objects of type 'str' are valid inputs, file paths, files,
urls, etc. are prohibited or ignored.
"""
  # __doc__ is None when docstrings are stripped (e.g. python -OO);
  # fall back to an empty string instead of failing at import time.
  Pandas_read.__doc__ = (original_function.__doc__ or "") + disclaimer
  globals()[function_name] = Pandas_read
def _addRestrictedPandasReadFunctionTuple():
  """
  Install the restricted wrapper for each supported pandas read function.
  """
  # Read functions which are deliberately not wrapped:
  #   "read_html": needs installation of additional dependency: html5lib
  #   "read_xml": only available for pandas version >= 1.3.0
  for function_name in ("read_json", "read_csv", "read_fwf"):
    _addRestrictedPandasReadFunction(function_name)
# Install the restricted read functions when this module is imported.
_addRestrictedPandasReadFunctionTuple()
\ No newline at end of file
...@@ -371,6 +371,7 @@ MNAME_MAP = { ...@@ -371,6 +371,7 @@ MNAME_MAP = {
'calendar': 'Products.ERP5Type.Calendar', 'calendar': 'Products.ERP5Type.Calendar',
'collections': 'Products.ERP5Type.Collections', 'collections': 'Products.ERP5Type.Collections',
'six': 'Products.ERP5Type.Six', 'six': 'Products.ERP5Type.Six',
'pandas': 'Products.ERP5Type.Pandas',
} }
for alias, real in six.iteritems(MNAME_MAP): for alias, real in six.iteritems(MNAME_MAP):
assert '.' not in alias, alias # TODO: support this assert '.' not in alias, alias # TODO: support this
...@@ -478,23 +479,20 @@ def restrictedMethod(s,name): ...@@ -478,23 +479,20 @@ def restrictedMethod(s,name):
raise Unauthorized(name) raise Unauthorized(name)
return dummyMethod return dummyMethod
try: try:
import pandas as pd import pandas as pd
except ImportError: except ImportError:
pass pass
else: else:
allow_module('pandas')
allow_type(pd.Series)
allow_type(pd.Timestamp) allow_type(pd.Timestamp)
allow_type(pd.DatetimeIndex) allow_type(pd.DatetimeIndex)
# XXX: pd.DataFrame has its own security thus disable
# until we can fully integrate it
#allow_type(pd.DataFrame)
allow_type(pd.MultiIndex) allow_type(pd.MultiIndex)
allow_type(pd.indexes.range.RangeIndex) allow_type(pd.indexes.range.RangeIndex)
allow_type(pd.indexes.numeric.Int64Index) allow_type(pd.indexes.numeric.Int64Index)
allow_type(pd.core.groupby.DataFrameGroupBy) allow_type(pd.core.groupby.DataFrameGroupBy)
allow_type(pd.core.groupby.SeriesGroupBy) allow_type(pd.core.groupby.SeriesGroupBy)
allow_class(pd.DataFrame) allow_class(pd.DataFrame)
# Note: These black_list methods are for pandas 0.19.2 # Note: These black_list methods are for pandas 0.19.2
...@@ -503,10 +501,10 @@ else: ...@@ -503,10 +501,10 @@ else:
ContainerAssertions[pd.Series] = _check_access_wrapper( ContainerAssertions[pd.Series] = _check_access_wrapper(
pd.Series, dict.fromkeys(series_black_list, restrictedMethod)) pd.Series, dict.fromkeys(series_black_list, restrictedMethod))
pandas_black_list = ('read_csv', 'read_json', 'read_pickle', 'read_hdf', pandas_black_list = ('read_pickle', 'read_hdf',
'read_fwf', 'read_excel', 'read_html', 'read_msgpack', 'read_excel', 'read_html', 'read_msgpack',
'read_gbq', 'read_sas', 'read_stata') 'read_gbq', 'read_sas', 'read_stata')
ModuleSecurityInfo('pandas').declarePrivate(*pandas_black_list) ModuleSecurityInfo(MNAME_MAP['pandas']).declarePrivate(*pandas_black_list)
dataframe_black_list = ('to_csv', 'to_json', 'to_pickle', 'to_hdf', dataframe_black_list = ('to_csv', 'to_json', 'to_pickle', 'to_hdf',
'to_excel', 'to_html', 'to_sql', 'to_msgpack', 'to_excel', 'to_html', 'to_sql', 'to_msgpack',
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment