Commit 19283f5c authored by Alain Takoudjou's avatar Alain Takoudjou

Jscrawler: Javascript Nodejs website crawler

See merge request nexedi/slapos!904
parents 7bc19d84 48ae09ff
# THIS IS NOT A BUILDOUT FILE, despite purposedly using a compatible syntax.
# The only allowed lines here are (regexes):
# - "^#" comments, copied verbatim
# - "^[" section beginings, copied verbatim
# - lines containing an "=" sign which must fit in the following categorie.
# - "^\s*filename\s*=\s*path\s*$" where "path" is relative to this file
# Copied verbatim.
# - "^\s*hashtype\s*=.*" where "hashtype" is one of the values supported
# by the re-generation script.
# Re-generated.
# - other lines are copied verbatim
# Substitution (${...:...}), extension ([buildout] extends = ...) and
# section inheritance (< = ...) are NOT supported (but you should really
# not need these here).
[instance]
md5sum = 6c17361a49cfc47564063b867aab6e8c
filename = instance.cfg.in
[template-jscrawler]
md5sum = 8fa425275120e8ba5c466aff00b48e7b
filename = instance-jscrawler.cfg.jinja2.in
[template-jscrawler-builder]
md5sum = c5e8f8b3983d5f572a564b34fa0f7499
filename = template-jscrawler.builder.sh.in
{
"type": "object",
"$schema": "http://json-schema.org/draft-06/schema",
"title": "Input Parameters",
"properties": {
"urls": {
"title": "List of website URL to crawl (newline separated).",
"description": "List of website URL to crawl (newline separated).",
"type": "string",
"textarea": true,
"default": ""
},
"crawl-periodicity": {
"title": "Number of days periodicity before re-crawl a site.",
"description": "Number of days before next crawl of URL will start and regenerate the sitemap. Set 0 to disable re-crawl.",
"type": "integer",
"default": 0,
"enum": [
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
15,
20,
30,
60
]
}
}
}
{
"name": "Output Parameters",
"properties": {
"url": {
"title": "Frontend http server URL",
"description": "Frontend http server URL for sitemap download.",
"type": "string",
"format": "uri"
}
}
}
[directory]
recipe = slapos.cookbook:mkdirectory
etc = ${buildout:directory}/etc
bin = ${buildout:directory}/bin
srv = ${buildout:directory}/srv
var = ${buildout:directory}/var
run = ${:var}/run
log = ${:var}/log
scripts = ${:etc}/run
services = ${:etc}/service
plugins = ${:etc}/plugin
ssl = ${:etc}/ssl
www = ${:srv}/www
tmp = ${:srv}/tmp
#################################
# httpd service
#################################
[gen-certificate]
recipe = plone.recipe.command
command = "{{ parameter_dict['openssl'] }}" req -newkey rsa -batch -new -x509 -days 3650 -nodes -keyout "${:ssl_key}" -out "${:ssl_crt}"
stop-on-error = true
ssl_crt = ${directory:ssl}/httpd.crt
ssl_key = ${directory:ssl}/httpd.key
[httpd-wrapper]
recipe = slapos.cookbook:simplehttpserver
host = {{ (ipv6 | list)[0] }}
port = 9083
base-path = ${directory:www}
wrapper = ${directory:services}/http-server
log-file = ${directory:log}/httpd.log
use-hash-url = false
cert-file = ${gen-certificate:ssl_crt}
key-file = ${gen-certificate:ssl_key}
[request-jscrawler-frontend]
<= slap-connection
recipe = slapos.cookbook:requestoptional
name = jscawler Frontend
# XXX We have hardcoded SR URL here.
software-url = http://git.erp5.org/gitweb/slapos.git/blob_plain/HEAD:/software/apache-frontend/software.cfg
slave = true
config-url = https://[${httpd-wrapper:host}]:${httpd-wrapper:port}
return = secure_access domain
[jscrawler-frontend-promise]
<= monitor-promise-base
module = check_url_available
name = jscrawler_frontend.py
config-url = ${request-jscrawler-frontend:connection-secure_access}
config-check-secure = 1
[logrotate-entry-httpd]
<= logrotate-entry-base
name = http-server
log = ${httpd-wrapper:log-file}
[httpd-listen-promise]
<= monitor-promise-base
module = check_port_listening
name = httpd-listen.py
config-hostname = ${httpd-wrapper:host}
config-port = ${httpd-wrapper:port}
[jscrawler-wrapper]
recipe = slapos.cookbook:wrapper
command-line =
{{ parameter_dict['nodejs-location'] }} {{ parameter_dict['jscrawler-location'] }}
wrapper-path = ${directory:bin}/jscrawler
[jscrawler-build-wrapper]
recipe = slapos.recipe.template:jinja2
template = {{ parameter_dict['template-jscrawler'] }}
rendered = ${directory:bin}/jscrawler-build
extensions = jinja2.ext.do
mode = 0700
list = {{ slapparameter_dict.get('urls', "").split("\n") | join('\n ') }}
period = {{ slapparameter_dict.get('crawl-periodicity', 0) }}
context =
key public_folder directory:www
key tmp_folder directory:tmp
key jscrawler_wrapper jscrawler-wrapper:wrapper-path
key period :period
key url_list :list
raw shell_binary {{ bash_executable_location }}
raw pid_file ${directory:run}/jscrawler.pid
[cron-entry-build-sitemap]
<= cron
recipe = slapos.cookbook:cron.d
name = jscrawler-build
frequency = * * * * *
command = ${jscrawler-build-wrapper:rendered}
[publish-connection-information]
<= monitor-publish
recipe = slapos.cookbook:publish.serialised
url = ${request-jscrawler-frontend:connection-secure_access}
[buildout]
extends = {{ template_monitor }}
parts =
publish-connection-information
logrotate-entry-httpd
# crawler cron
cron-entry-build-sitemap
httpd-wrapper
httpd-listen-promise
jscrawler-frontend-promise
eggs-directory = {{ eggs_directory }}
develop-eggs-directory = {{ develop_eggs_directory }}
offline = true
[buildout]
parts = switch-softwaretype
# std stuff for slapos instance
eggs-directory = {{ buildout_egg_directory }}
develop-eggs-directory = {{ buildout_develop_directory }}
offline = true
[switch-softwaretype]
recipe = slapos.cookbook:switch-softwaretype
default = dynamic-template-jscrawler:rendered
RootSoftwareInstance = ${:default}
[slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[jinja2-template-base]
recipe = slapos.recipe.template:jinja2
rendered = ${buildout:directory}/${:filename}
extensions = jinja2.ext.do
mode = 0644
extra-context =
context =
key develop_eggs_directory buildout:develop-eggs-directory
key buildout_directory buildout:directory
key eggs_directory buildout:eggs-directory
key ipv4 slap-configuration:ipv4
key ipv6 slap-configuration:ipv6
key global_ipv4_prefix network-information:global-ipv4-network
key slapparameter_dict slap-configuration:configuration
key computer_id slap-configuration:computer
raw bash_executable_location {{ bash_location }}/bin/dash
raw template_monitor {{ template_monitor_cfg }}
raw logrotate_cfg {{ logrotate_cfg }}
${:extra-context}
[dynamic-template-jscrawler-parameters]
openssl = {{ openssl_location }}/bin/openssl
jscrawler-location = {{ jscrawler_location }}/crawler.js
nodejs-location = {{ nodejs_location }}/bin/node
template-jscrawler = {{ template_jscrawler_builder }}
[dynamic-template-jscrawler]
<= jinja2-template-base
template = {{ template_jscrawler }}
filename = instance-jscrawler.cfg
extra-context =
section parameter_dict dynamic-template-jscrawler-parameters
[buildout]
extends =
../../component/bash/buildout.cfg
../../component/openssl/buildout.cfg
../../component/curl/buildout.cfg
../../component/git/buildout.cfg
../../stack/slapos.cfg
../../stack/nodejs.cfg
../../stack/monitor/buildout.cfg
./buildout.hash.cfg
parts =
slapos-cookbook
nodejs
openssl
jscrawler-build
instance
[nodejs]
<= nodejs-12.18.3
[jscrawler]
recipe = slapos.recipe.build:gitclone
repository = https://lab.nexedi.com/Mynij/mynij-crawler.git
revision = ccbdfdc4712c008034b891f081be92b9342c48ac
git-executable = ${git:location}/bin/git
[jscrawler-build]
recipe = plone.recipe.command
stop-on-error = true
command =
cd ${jscrawler:location} && PATH=${nodejs:location}/bin:$PATH npm install
update-command = ${:command}
[download-template]
recipe = slapos.recipe.build:download
url = ${:_profile_base_location_}/${:filename}
mode = 0644
[instance]
recipe = slapos.recipe.template:jinja2
rendered = ${buildout:directory}/instance.cfg
template = ${:_profile_base_location_}/${:filename}
mode = 0644
context =
key bash_location bash:location
key bin_directory buildout:bin-directory
key buildout_egg_directory buildout:eggs-directory
key buildout_develop_directory buildout:develop-eggs-directory
key buildout_directory buildout:directory
key template_monitor_cfg monitor2-template:rendered
key logrotate_cfg template-logrotate-base:rendered
key jscrawler_location jscrawler:location
key nodejs_location nodejs:location
key openssl_location openssl:location
key template_jscrawler template-jscrawler:target
key template_jscrawler_builder template-jscrawler-builder:target
[template-jscrawler]
<= download-template
output = ${buildout:directory}/instance-jscrawler.cfg.jinja2
[template-jscrawler-builder]
<= download-template
output = ${buildout:directory}/template-jscrawler.builder.sh.in
{
"name": "JSCrawler",
"description": "JSCrawler",
"serialisation": "xml",
"software-type": {
"default": {
"title": "Default",
"description": "JSCrawler",
"request": "instance-jscrawler-input-schema.json",
"response": "instance-jscrawler-output-schema.json",
"index": 0
}
}
}
#!/bin/bash
URLS="{{ url_list }}"
OUTPUT_DIR="{{ public_folder }}"
TMP_DIR="{{ tmp_folder }}"
PERIOD="{{ period }}"
if [ -s "{{ pid_file}}" ]; then
echo "Crawler process already running with pid `cat {{ pid_file}}`"
exit 1
fi
trap "rm -f -- '{{ pid_file}}'" EXIT
echo $$ > "{{ pid_file}}"
crawl() {
{{ jscrawler_wrapper }} -f $TMP_OUTPUT --link $1
if [ -s "$2" ]; then
mv $2 $3
fi
}
check_crawl() {
if [ -z "$PERIOD" ] || [ "$PERIOD" -eq 0 ] ; then
echo "Already crawled $1... SKIPPED"
return 0;
fi
sitemap=$3
tmp=$2
url=$1
NOW=$(date +"%s")
T=$(stat -c %Y $sitemap)
I=$((T+86400*PERIOD))
diff=$((NOW-I))
if [ "$diff" -gt 0 ]; then
crawl $url $tmp $sitemap
else
echo "Already crawled $url... SKIPPED"
fi
}
for url in `echo $URLS`
do
# Name is the URL domain
NAME=$(echo "$url" | cut -d'/' -f3)
# Add path is it exists in URL
NAME="$NAME$(echo $url | grep -Po '\w\K/\w+[^?]+' | sed -r 's/\/+/_/g')"
echo "Checking $NAME..."
OUTPUT="$OUTPUT_DIR/$NAME.xml"
TMP_OUTPUT="$TMP_DIR/$NAME.xml"
if [ -s "$OUTPUT" ]; then
check_crawl $url $TMP_OUTPUT $OUTPUT
else
crawl $url $TMP_OUTPUT $OUTPUT
fi
done
Tests for JsCrawler software release
##############################################################################
#
# Copyright (c) 2018 Nexedi SA and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from setuptools import setup, find_packages
version = '0.0.1.dev0'
name = 'slapos.test.jscrawler'
long_description = open("README.md").read()
setup(
name=name,
version=version,
description="Test for SlapOS' JSCrawler",
long_description=long_description,
long_description_content_type='text/markdown',
maintainer="Nexedi",
maintainer_email="info@nexedi.com",
url="https://lab.nexedi.com/nexedi/slapos",
packages=find_packages(),
install_requires=[
'slapos.core',
'slapos.libnetworkcache',
'erp5.util',
'supervisor',
'requests',
],
zip_safe=True,
test_suite='test',
)
##############################################################################
#
# Copyright (c) 2019 Nexedi SA and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
import os
import logging
from six.moves.urllib.parse import urlparse
import requests
import time
from slapos.testing.testcase import makeModuleSetUpAndTestCaseClass
from slapos.testing.utils import ManagedHTTPServer
setUpModule, SlapOSInstanceTestCase = makeModuleSetUpAndTestCaseClass(
os.path.abspath(
os.path.join(os.path.dirname(__file__), '..', 'software.cfg')))
class TestJSCrawler(SlapOSInstanceTestCase):
@classmethod
def getInstanceParameterDict(cls):
class TestServer(ManagedHTTPServer):
def do_GET(self):
# type: () -> None
self.send_response(200)
self.send_header("Content-Type", "application/html")
self.end_headers()
self.wfile.write('<title>Hello {}</title>'.format(self._name))
return {
'urls': '\n'.join([
cls.getManagedResource('website1', TestServer).url,
cls.getManagedResource('website2', TestServer).url,
])
}
def setUp(self):
self.url = self.computer_partition.getConnectionParameterDict(
)['url']
def test_http_get(self):
resp = requests.get(self.url, verify=False)
self.assertTrue(
resp.status_code in [requests.codes.ok, requests.codes.found])
def test_crawled_sitemap(self):
url_list = self.computer_partition.getInstanceParameterDict()['urls'].split('\n')
time.sleep(70) # wait until cron runs
website1 = urlparse(url_list[0]).netloc
sitemap1 = requests.get(self.url + '/%s.xml' % website1, verify=False)
self.assertEqual(sitemap1.status_code, requests.codes.ok)
website2 = urlparse(url_list[1]).netloc
sitemap2 = requests.get(self.url + '/%s.xml' % website2, verify=False)
self.assertEqual(sitemap2.status_code, requests.codes.ok)
......@@ -162,6 +162,11 @@ setup = ${slapos-repository:location}/software/dream/test/
egg = slapos.test.repman
setup = ${slapos-repository:location}/software/repman/test/
[slapos.test.jscrawler-setup]
<= setup-develop-egg
egg = slapos.test.jscrawler
setup = ${slapos-repository:location}/software/jscrawler/test/
[slapos.core-repository]
<= git-clone-repository
repository = https://lab.nexedi.com/nexedi/slapos.core.git
......@@ -215,6 +220,7 @@ extra-eggs =
${slapos.test.metabase-setup:egg}
${slapos.test.repman-setup:egg}
${slapos.test.caucase-setup:egg}
${slapos.test.jscrawler-setup:egg}
# We don't name this interpreter `python`, so that when we run slapos node
# software, installation scripts running `python` use a python without any
......@@ -285,6 +291,7 @@ extra =
cloudooo ${slapos.test.cloudooo-setup:setup}
dream ${slapos.test.dream-setup:setup}
caucase ${slapos.test.caucase-setup:setup}
jscrawler ${slapos.test.jscrawler-setup:setup}
[versions]
# slapos.core is used from the clone always
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment