Commit f62b9a22 by Vincent Pelletier

software/erp5: Expose mariadb max-connections.

Auto-compute a value suitable for the number of requested Zope processes
and threads for default ERP5 needs.
parent 896cd97c
......@@ -97,6 +97,11 @@
"default": 1,
"type": "number"
},
"max-connection-count": {
"description": "See MariaDB documentation on max_connections. If not provided, a value suitable for the number of requested Zope processes is chosen.",
"minimum": 0,
"type": "integer"
},
"relaxed-writes": {
"description": "When enabled, sets innodb_flush_log_at_trx_commit = 0, innodb_flush_method = nosync, innodb_doublewrite = 0 and sync_frm = 0 - RTFM, those options are dangerous",
"default": false,
......
......@@ -26,7 +26,7 @@ md5sum = d95e8500bdc72d1f40b97cc414656e7e
[template-mariadb]
filename = instance-mariadb.cfg.in
md5sum = 14ae385a0fc5c0b4e03466eb786a451a
md5sum = 9df786692c61bd8d3a6f4e7ed15f272f
[template-kumofs]
filename = instance-kumofs.cfg.in
......@@ -42,7 +42,7 @@ md5sum = d32417746fcf671d4e86a70379815039
[template-my-cnf]
filename = my.cnf.in
md5sum = 7a882ff275f723fdf30869cb7f1b90d3
md5sum = 5a6f337117ba8b72d7fe3b7a9f26f5f6
[template-mariadb-initial-setup]
filename = mariadb_initial_setup.sql.in
......@@ -78,7 +78,7 @@ md5sum = d41d8cd98f00b204e9800998ecf8427e
[template-erp5]
filename = instance-erp5.cfg.in
md5sum = 36f00362c6703fc0a5519f90f733a2fd
md5sum = 7dd00dedef4cc4320ec6977a7e2dc110
[template-zeo]
filename = instance-zeo.cfg.in
......
......@@ -22,6 +22,14 @@
{% set monitor_base_url_dict = {} -%}
{% set monitor_dict = slapparameter_dict.get('monitor', {}) %}
{% set use_ipv6 = slapparameter_dict.get('use-ipv6', False) -%}
{% set partition_thread_count_list = [] -%}
{% set zope_partition_dict = slapparameter_dict.get('zope-partition-dict', {'1': {}}) -%}
{% for zope_parameter_dict in zope_partition_dict.values() -%}
{# Apply some zope_parameter_dict default values, to avoid duplication. -#}
{% do zope_parameter_dict.setdefault('thread-amount', 4) -%}
{% do zope_parameter_dict.setdefault('instance-count', 1) -%}
{% do partition_thread_count_list.append(zope_parameter_dict['thread-amount'] * zope_parameter_dict['instance-count']) -%}
  • {%   do partition_thread_count_list.append(zope_parameter_dict.setdefault('thread-amount', 4) *
                                               zope_parameter_dict.setdefault('instance-count', 1)) -%}
Please register or sign in to reply
{% endfor -%}
[request-common]
<= request-common-base
config-use-ipv6 = {{ dumps(slapparameter_dict.get('use-ipv6', False)) }}
......@@ -94,12 +102,20 @@ backup-caucased = ${:srv}/backup/caucased
{{ request('memcached-persistent', 'kumofs', 'kumofs', {'tcpv4-port': 2000}, {'url': True, 'monitor-base-url': False}, key_config={'monitor-passwd': 'monitor-htpasswd:passwd'}) }}
{{ request('memcached-volatile', 'kumofs', 'memcached', {'tcpv4-port': 2010, 'ram-storage-size': 64}, {'url': True, 'monitor-base-url': False}, key_config={'monitor-passwd': 'monitor-htpasswd:passwd'}) }}
{# Notes on max-connection-count: On a standard ERP5, each transaction
can have 4 connections to mariadb: activities, catalog, deferred and
transactionless. Count 5 to have some headroom. Multiply by the total
number of zope threads for all processes from all partitions to get the
expected number of connections. Add 50 to have some more zope-independent
headroom (automated probes, replication, ...).
-#}
{{ request('mariadb', 'mariadb', 'mariadb',
{
'tcpv4-port': 2099,
'max-slowqueries-threshold': monitor_dict.get('max-slowqueries-threshold', 1000),
'slowest-query-threshold': monitor_dict.get('slowest-query-threshold', ''),
'test-database-amount': test_runner_total_database_count,
'max-connection-count': sum(partition_thread_count_list) * 5 + 50,
},
{
'database-list': True,
......@@ -149,7 +165,6 @@ connection-url = smtp://127.0.0.2:0/
{% endfor -%}
{% set zope_partition_dict = slapparameter_dict.get('zope-partition-dict', {'1': {}}) -%}
{% set zope_address_list_id_dict = {} -%}
{% if zope_partition_dict -%}
......@@ -231,9 +246,9 @@ name = {{ partition_name }}
{% do monitor_base_url_dict.__setitem__(section_name, '${' ~ section_name ~ ':connection-monitor-base-url}') -%}
{{ root_common.sla(partition_name) }}
config-name = {{ dumps(custom_name) }}
config-instance-count = {{ dumps(zope_parameter_dict.get('instance-count', 1)) }}
config-instance-count = {{ dumps(zope_parameter_dict['instance-count']) }}
config-private-dev-shm = {{ zope_parameter_dict.get('private-dev-shm', '') }}
config-thread-amount = {{ dumps(zope_parameter_dict.get('thread-amount', 4)) }}
config-thread-amount = {{ dumps(zope_parameter_dict['thread-amount']) }}
config-timerserver-interval = {{ dumps(zope_parameter_dict.get('timerserver-interval', 5)) }}
config-longrequest-logger-interval = {{ dumps(zope_parameter_dict.get('longrequest-logger-interval', -1)) }}
config-longrequest-logger-timeout = {{ dumps(zope_parameter_dict.get('longrequest-logger-timeout', 1)) }}
......
......@@ -133,6 +133,7 @@ pid-file = ${directory:run}/mariadb.pid
error-log = ${directory:log}/mariadb_error.log
slow-query-log = ${directory:log}/mariadb_slowquery.log
long-query-time = {{ dumps(slapparameter_dict.get('long-query-time', 1)) }}
max-connection-count = {{ dumps(slapparameter_dict.get('max-connection-count', 1000)) }}
innodb-buffer-pool-size = {{ dumps(slapparameter_dict.get('innodb-buffer-pool-size', 0)) }}
innodb-buffer-pool-instances = {{ dumps(slapparameter_dict.get('innodb-buffer-pool-instances', 0)) }}
innodb-log-file-size = {{ dumps(slapparameter_dict.get('innodb-log-file-size', 0)) }}
......
......@@ -31,9 +31,7 @@ innodb_file_per_table = {{ parameter_dict['innodb-file-per-table'] }}
plugin_load = ha_mroonga
# By default only 100 connections are allowed, when using zeo
# we may have much more connections
max_connections = 1000
max_connections = {{ parameter_dict['max-connection-count'] }}
{% set innodb_buffer_pool_size = parameter_dict['innodb-buffer-pool-size'] -%}
{% if innodb_buffer_pool_size %}innodb_buffer_pool_size = {{ innodb_buffer_pool_size }}{% endif %}
......
  • Hello @vpelletier,

    This looks related to the recent ERP5-MASTER generic test regression, with errors such as "OperationalError: (1040, 'Too many connections')". Only one other commit was part of the regression: 896cd97c, but that commit is probably not related.

    I see here that it looks like you kept the default 1000, but since tests are failing, there must be an issue somewhere.

    /cc @nexedi (to have information on the regression)

  • There's no such default value of 1000 in this commit. I don't understand the purpose of .get(..., 1000) in:

    max-connection-count = {{ dumps(slapparameter_dict.get('max-connection-count', 1000)) }}

    (rather than simply [....], since the value is always there)

    • Do we want to support direct instantiation of the mariadb software type ?
    • Is it a problem if the instantiation of the mariadb partition fails temporarily because the value is not propagated yet from the root ?
  • I see here that it looks like you kept the default 1000, but since tests are failing, there must be an issue somewhere.

    The default does not really matter in a "normal" instance tree, as the value is computed from requested zope partitions. How many zopes are requested in test setups ? With how many threads ?

    A possible quick fix would be a min(computed_value, 1000), but I would like to understand how our tests initiate more than 50 connections to mariadb (assuming 0 zopes are requested): even if we count 1 test thread, 1 zope thread (for UI tests), and all connections being used, we should not exceed 8 connections.

    • Do we want to support direct instantiation of the mariadb software type ?

    I would like to, yes. This is the intent behind splitting the input schemas: it allows declaring the software types in the base schema and linking to their respective input and output schemas. I did not push further in this direction yet, so only the default software type is referenced.

  • A possible quick fix would be a min(computed_value, 1000)

    For reference, here are the fixes I thought of, by rough decreasing preference order:

    • fix any easy connection leak in unittests
    • provide a custom connection count parameter to test nodes if there is an identified but hard connection leak in unittests, so that we get time to eventually fix them
    • increase the default connection count offset if there are solid reasons why we need so many connections, and hopefully the passing value is low enough (<=100)
    • put back such non-linearity in the formula (min(computed_value, 1000))
    Edited by Vincent Pelletier
  • I could reproduce the issue with testERP5Security: this file has 15 test classes, each one gets its own ERP5Site instance, each one with its own 4 SQL connectors. So as test progresses, more and more connections are established, and never deleted nor disconnected. In addition to these, for some reason each connection is established twice: once when creating the instance, and once more while running the test. So while running the 9th class, it exceeded 70 connections and test fails.

    Here are the class counts for the tests which regressed because of this change, which seems to confirm this is the issue (taken from latest result):

    ./product/ERP5/tests/testAccounting.py:11
    ./product/ERP5Security/tests/testERP5Security.py:15
    ./product/ERP5Form/tests/testFields.py:12
    ./product/ERP5/tests/testTradeCondition.py:16

    Some other tests have over 8 classes, so maybe there is something else allowing these to pass (not all classes get run in those tests ? or not in the same process ?).

    I'm looking at how this can be fixed in unit test code, by closing connections in tearDown.

  • mentioned in merge request erp5!911 (closed)

    Toggle commit list
  • Issue should be fixed by latest push in ERP5 repository.

  • So the first "fix any easy connection leak in unittests" was done in erp5@5abb074d, but we still sometimes observe "Too many connections" problems on test nodes: often testBusinessTemplate is failing, but sometimes also functional tests like this testPortalContributionsToolNewFile.

    The configuration of ERP5-MASTER test suite is :

    {
        "mariadb": {
            "relaxed-writes": true,
            "mariadb-relaxed-writes": true,
            "test-database-amount": 30
        },
        "cloudooo-url": "https://.....net"
    }

    which gives us 70 max connections: 1 partition * 4 zserver_threads * 5 connections + 50 more.

    We are running 3 runUnitTest processes, some of them are using more connections (with getExtraSqlConnectionStringList ), some of them are spawning other zope nodes (using --activity_node= like conflict resolution test is doing). On the test instance, that zope is started; even though it is not really accessed except by haproxy and some monitoring promises, it probably opens a few connections. Maybe also some mysql monitoring and other scripts are consuming connections.

    Even in the worst case, I'm not sure how this adds up to 70.

    There's this test-database-amount parameter that we are using to allocate some extra databases ( for getExtraSqlConnectionStringList ), we should probably add this in the computation of default value of max connections, because currently the connections used by tests have to fit in the extra 50 connections. We could add test-database-amount in connections in the computation of default value (probably multiplied by a factor of 5 for tests using --activity_node= + some safety), formula could become:

    X zopes partitions * Y zserver threads * 5 connections + test-database-amount * 5 connections + 50 extras 

    ... or just use simple approach of min(max-connection-count, 1000) that was suggested earlier.

    As a temporary measure to verify this theory and so that we have less test failures, I reconfigured ERP5-MASTER test suite to set max-connection-count.

  • We could add test-database-amount in connections in the computation of default value

    I guess 30 can be decomposed as 3 * 10, so that one runUnitTest may use up to 10 databases. Is this correct ?

    In this case, it indeed makes perfect sense to involve this number in the maximum connection count, and would explain why I could not see this error when running this test on my own: I ran a single runUnitTest at a time.

    BTW, I'm surprised we would need 10 databases per test. I believe most tests use 1, hot-reindex test at least 2... But I doubt we use many more than that. But that's another topic...

  • It's a bit more complex. For reference, my understanding is that it works like this:

    There are two levels of parallelism in test nodes: first, we have multiple test nodes, and second, each test node also runs more than one test in parallel (using the "node-quantity" parameter); in practice our test nodes have 3.

    ERP5 software release generates a runTestSuite wrapper where all connection strings are passed to runTestSuite --db_list.

    Test nodes execute runTestSuite, passing their configured "node-quantity". runTestSuite is implemented here for ERP5. It will spawn 3 ("node-quantity") runUnitTest processes, distributing connection strings to the runUnitTest processes, each one receiving the number given by mysql_db_count defined on the test suite class configured on Nexedi ERP5.

    For ERP5-MASTER suite, the test suite class is "ERP5", which is defined here, so we have 3 connection strings for each of the 3 nodes (from "node-quantity").

    image

    When testnode executes the runTestSuite , it's like this:

    runTestSuite --db_list=cnx1,cnx2,cnx3,cnx4,..,cnx100 --node_quantity 3 --test_suite ERP5 ...

    and runTestSuite will execute 3 runUnitTest processes: (BTW, this runUnitTest is not the wrapper generated by ERP5 software release - the runUnitTest wrapper is to run test "manually")

    runUnitTest --erp5_sql_connection_string=cnx1 --extra_sql_connection_string_list=cnx2,cnx3 .. testBusinessTemplate
    runUnitTest --erp5_sql_connection_string=cnx4 --extra_sql_connection_string_list=cnx5,cnx6 .. testERP5Core
    runUnitTest --erp5_sql_connection_string=cnx7 --extra_sql_connection_string_list=cnx8,cnx9 .. testSomethingElse

    You are right that we don't need 10 database per tests and in the case of ERP5-MASTER tests, they will not be used.

    One more thing is that requesting with test-database-amount like we do is deprecated ( since e4d1ea03 ... we can now update test suites configurations, at least for ERP5-MASTER ); the ERP5 software release has "better" request parameters. The relevant part of the instance schema is this:

        "test-runner": {
          "description": "Test runner parameters.",
          "properties": {
            "enabled": {
              "description": "Generate helper scripts to run test suite.",
              "default": true,
              "type": "boolean"
            },
            "node-count": {
              "description": "Number of tests this instance can execute in parrallel. This must be at least equal to the number of nodes configured on testnode running the test",
              "default": 3,
              "type": "integer"
            },
            "extra-database-count": {
              "description": "Number of extra databases this instance tests will need.",
              "default": 3, 
              "type": "integer"
            }
          },
          "type": "object"
        },

    so in the test suite definition we could set test-runner.node-count and test-runner.extra-database-count instead of mysql.test-database-amount. We could even not set it and just leave the default, as it provides more databases than what the default ERP5 suite needs (because ERP5 tests use 3 databases in total and the default parameter is 3 extra databases, so 4 in total).

    That's a long explanation, but in the end, in the erp5 instance we calculate test_runner_total_database_count, and what I suggest is adding it to the default calculation, maybe something like jerome/slapos@d12d1259

  • mentioned in merge request erp5!995 (merged)

    Toggle commit list
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment