Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
slapos
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Tom Niget
slapos
Commits
19283f5c
Commit
19283f5c
authored
Feb 12, 2021
by
Alain Takoudjou
Browse files
Options
Browse Files
Download
Plain Diff
Jscrawler: Javascript Nodejs website crawler
See merge request
nexedi/slapos!904
parents
7bc19d84
48ae09ff
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
528 additions
and
0 deletions
+528
-0
software/jscrawler/buildout.hash.cfg
software/jscrawler/buildout.hash.cfg
+26
-0
software/jscrawler/instance-jscrawler-input-schema.json
software/jscrawler/instance-jscrawler-input-schema.json
+37
-0
software/jscrawler/instance-jscrawler-output-schema.json
software/jscrawler/instance-jscrawler-output-schema.json
+11
-0
software/jscrawler/instance-jscrawler.cfg.jinja2.in
software/jscrawler/instance-jscrawler.cfg.jinja2.in
+118
-0
software/jscrawler/instance.cfg.in
software/jscrawler/instance.cfg.in
+55
-0
software/jscrawler/software.cfg
software/jscrawler/software.cfg
+67
-0
software/jscrawler/software.cfg.json
software/jscrawler/software.cfg.json
+14
-0
software/jscrawler/template-jscrawler.builder.sh.in
software/jscrawler/template-jscrawler.builder.sh.in
+59
-0
software/jscrawler/test/README.md
software/jscrawler/test/README.md
+1
-0
software/jscrawler/test/setup.py
software/jscrawler/test/setup.py
+52
-0
software/jscrawler/test/test.py
software/jscrawler/test/test.py
+81
-0
software/slapos-sr-testing/software.cfg
software/slapos-sr-testing/software.cfg
+7
-0
No files found.
software/jscrawler/buildout.hash.cfg
0 → 100644
View file @
19283f5c
# THIS IS NOT A BUILDOUT FILE, despite purposely using a compatible syntax.
# The only allowed lines here are (regexes):
# - "^#" comments, copied verbatim
# - "^[" section beginnings, copied verbatim
# - lines containing an "=" sign which must fit in the following categories.
# - "^\s*filename\s*=\s*path\s*$" where "path" is relative to this file
# Copied verbatim.
# - "^\s*hashtype\s*=.*" where "hashtype" is one of the values supported
# by the re-generation script.
# Re-generated.
# - other lines are copied verbatim
# Substitution (${...:...}), extension ([buildout] extends = ...) and
# section inheritance (< = ...) are NOT supported (but you should really
# not need these here).
[instance]
md5sum = 6c17361a49cfc47564063b867aab6e8c
filename = instance.cfg.in
[template-jscrawler]
md5sum = 8fa425275120e8ba5c466aff00b48e7b
filename = instance-jscrawler.cfg.jinja2.in
[template-jscrawler-builder]
md5sum = c5e8f8b3983d5f572a564b34fa0f7499
filename = template-jscrawler.builder.sh.in
software/jscrawler/instance-jscrawler-input-schema.json
0 → 100644
View file @
19283f5c
{
"type"
:
"object"
,
"$schema"
:
"http://json-schema.org/draft-06/schema"
,
"title"
:
"Input Parameters"
,
"properties"
:
{
"urls"
:
{
"title"
:
"List of website URLs to crawl (newline separated)."
,
"description"
:
"List of website URLs to crawl (newline separated)."
,
"type"
:
"string"
,
"textarea"
:
true
,
"default"
:
""
},
"crawl-periodicity"
:
{
"title"
:
"Number of days between two crawls of a site."
,
"description"
:
"Number of days to wait before a URL is crawled again and its sitemap regenerated. Set to 0 to disable re-crawling."
,
"type"
:
"integer"
,
"default"
:
0
,
"enum"
:
[
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
15
,
20
,
30
,
60
]
}
}
}
software/jscrawler/instance-jscrawler-output-schema.json
0 → 100644
View file @
19283f5c
{
"name"
:
"Output Parameters"
,
"properties"
:
{
"url"
:
{
"title"
:
"Frontend http server URL"
,
"description"
:
"Frontend http server URL for sitemap download."
,
"type"
:
"string"
,
"format"
:
"uri"
}
}
}
software/jscrawler/instance-jscrawler.cfg.jinja2.in
0 → 100644
View file @
19283f5c
[directory]
recipe = slapos.cookbook:mkdirectory
etc = ${buildout:directory}/etc
bin = ${buildout:directory}/bin
srv = ${buildout:directory}/srv
var = ${buildout:directory}/var
run = ${:var}/run
log = ${:var}/log
scripts = ${:etc}/run
services = ${:etc}/service
plugins = ${:etc}/plugin
ssl = ${:etc}/ssl
www = ${:srv}/www
tmp = ${:srv}/tmp
#################################
# httpd service
#################################
[gen-certificate]
# Creates a self-signed X.509 certificate (-x509) together with a new RSA
# private key stored without passphrase (-nodes), valid for 3650 days.
# -batch keeps openssl non-interactive. The two output paths below are the
# ones [httpd-wrapper] references as cert-file and key-file.
recipe = plone.recipe.command
command = "{{ parameter_dict['openssl'] }}" req -newkey rsa -batch -new -x509 -days 3650 -nodes -keyout "${:ssl_key}" -out "${:ssl_crt}"
stop-on-error = true
ssl_crt = ${directory:ssl}/httpd.crt
ssl_key = ${directory:ssl}/httpd.key
[httpd-wrapper]
recipe = slapos.cookbook:simplehttpserver
host = {{ (ipv6 | list)[0] }}
port = 9083
base-path = ${directory:www}
wrapper = ${directory:services}/http-server
log-file = ${directory:log}/httpd.log
use-hash-url = false
cert-file = ${gen-certificate:ssl_crt}
key-file = ${gen-certificate:ssl_key}
[request-jscrawler-frontend]
# Requests a slave instance of the apache-frontend software release that
# points at the local HTTPS server, and asks for the "secure_access" and
# "domain" connection parameters in return.
<= slap-connection
recipe = slapos.cookbook:requestoptional
# NOTE(review): "jscawler" looks like a typo for "jscrawler". It is left
# unchanged here because this name presumably identifies the requested
# instance, so renaming it may trigger a brand new request -- confirm first.
name = jscawler Frontend
# XXX We have hardcoded SR URL here.
# NOTE(review): this URL uses plain http and follows HEAD, so the frontend
# profile is not pinned to a revision -- confirm this is intended.
software-url = http://git.erp5.org/gitweb/slapos.git/blob_plain/HEAD:/software/apache-frontend/software.cfg
slave = true
config-url = https://[${httpd-wrapper:host}]:${httpd-wrapper:port}
return = secure_access domain
[jscrawler-frontend-promise]
<= monitor-promise-base
module = check_url_available
name = jscrawler_frontend.py
config-url = ${request-jscrawler-frontend:connection-secure_access}
config-check-secure = 1
[logrotate-entry-httpd]
<= logrotate-entry-base
name = http-server
log = ${httpd-wrapper:log-file}
[httpd-listen-promise]
<= monitor-promise-base
module = check_port_listening
name = httpd-listen.py
config-hostname = ${httpd-wrapper:host}
config-port = ${httpd-wrapper:port}
[jscrawler-wrapper]
recipe = slapos.cookbook:wrapper
command-line =
{{ parameter_dict['nodejs-location'] }} {{ parameter_dict['jscrawler-location'] }}
wrapper-path = ${directory:bin}/jscrawler
[jscrawler-build-wrapper]
recipe = slapos.recipe.template:jinja2
template = {{ parameter_dict['template-jscrawler'] }}
rendered = ${directory:bin}/jscrawler-build
extensions = jinja2.ext.do
mode = 0700
list = {{ slapparameter_dict.get('urls', "").split("\n") | join('\n ') }}
period = {{ slapparameter_dict.get('crawl-periodicity', 0) }}
context =
key public_folder directory:www
key tmp_folder directory:tmp
key jscrawler_wrapper jscrawler-wrapper:wrapper-path
key period :period
key url_list :list
raw shell_binary {{ bash_executable_location }}
raw pid_file ${directory:run}/jscrawler.pid
[cron-entry-build-sitemap]
# Runs the rendered jscrawler-build script every minute ("* * * * *").
# The script guards itself with a pid file against overlapping runs and only
# crawls a URL that has no sitemap yet or whose crawl period has elapsed, so
# the schedule here does not by itself decide how often a site is crawled.
<= cron
recipe = slapos.cookbook:cron.d
name = jscrawler-build
frequency = * * * * *
command = ${jscrawler-build-wrapper:rendered}
[publish-connection-information]
<= monitor-publish
recipe = slapos.cookbook:publish.serialised
url = ${request-jscrawler-frontend:connection-secure_access}
[buildout]
extends = {{ template_monitor }}
parts =
publish-connection-information
logrotate-entry-httpd
# crawler cron
cron-entry-build-sitemap
httpd-wrapper
httpd-listen-promise
jscrawler-frontend-promise
eggs-directory = {{ eggs_directory }}
develop-eggs-directory = {{ develop_eggs_directory }}
offline = true
software/jscrawler/instance.cfg.in
0 → 100644
View file @
19283f5c
[buildout]
parts = switch-softwaretype
# std stuff for slapos instance
eggs-directory = {{ buildout_egg_directory }}
develop-eggs-directory = {{ buildout_develop_directory }}
offline = true
[switch-softwaretype]
recipe = slapos.cookbook:switch-softwaretype
default = dynamic-template-jscrawler:rendered
RootSoftwareInstance = ${:default}
[slap-configuration]
recipe = slapos.cookbook:slapconfiguration.serialised
computer = ${slap-connection:computer-id}
partition = ${slap-connection:partition-id}
url = ${slap-connection:server-url}
key = ${slap-connection:key-file}
cert = ${slap-connection:cert-file}
[jinja2-template-base]
# Shared base for Jinja2-rendered instance profiles. Sections extending it
# provide "template" and "filename", and may add rendering variables through
# "extra-context", which is appended at the end of "context".
recipe = slapos.recipe.template:jinja2
rendered = ${buildout:directory}/${:filename}
extensions = jinja2.ext.do
mode = 0644
extra-context =
# bash_executable_location is built from bash_location, the install prefix of
# the bash component handed over by software.cfg, so it has to name bin/bash:
# that component is not expected to ship a bin/dash.
context =
    key develop_eggs_directory buildout:develop-eggs-directory
    key buildout_directory buildout:directory
    key eggs_directory buildout:eggs-directory
    key ipv4 slap-configuration:ipv4
    key ipv6 slap-configuration:ipv6
    key global_ipv4_prefix network-information:global-ipv4-network
    key slapparameter_dict slap-configuration:configuration
    key computer_id slap-configuration:computer
    raw bash_executable_location {{ bash_location }}/bin/bash
    raw template_monitor {{ template_monitor_cfg }}
    raw logrotate_cfg {{ logrotate_cfg }}
    ${:extra-context}
[dynamic-template-jscrawler-parameters]
openssl = {{ openssl_location }}/bin/openssl
jscrawler-location = {{ jscrawler_location }}/crawler.js
nodejs-location = {{ nodejs_location }}/bin/node
template-jscrawler = {{ template_jscrawler_builder }}
[dynamic-template-jscrawler]
<= jinja2-template-base
template = {{ template_jscrawler }}
filename = instance-jscrawler.cfg
extra-context =
section parameter_dict dynamic-template-jscrawler-parameters
software/jscrawler/software.cfg
0 → 100644
View file @
19283f5c
[buildout]
extends =
../../component/bash/buildout.cfg
../../component/openssl/buildout.cfg
../../component/curl/buildout.cfg
../../component/git/buildout.cfg
../../stack/slapos.cfg
../../stack/nodejs.cfg
../../stack/monitor/buildout.cfg
./buildout.hash.cfg
parts =
slapos-cookbook
nodejs
openssl
jscrawler-build
instance
[nodejs]
<= nodejs-12.18.3
[jscrawler]
recipe = slapos.recipe.build:gitclone
repository = https://lab.nexedi.com/Mynij/mynij-crawler.git
revision = ccbdfdc4712c008034b891f081be92b9342c48ac
git-executable = ${git:location}/bin/git
[jscrawler-build]
recipe = plone.recipe.command
stop-on-error = true
command =
cd ${jscrawler:location} && PATH=${nodejs:location}/bin:$PATH npm install
update-command = ${:command}
[download-template]
recipe = slapos.recipe.build:download
url = ${:_profile_base_location_}/${:filename}
mode = 0644
[instance]
recipe = slapos.recipe.template:jinja2
rendered = ${buildout:directory}/instance.cfg
template = ${:_profile_base_location_}/${:filename}
mode = 0644
context =
key bash_location bash:location
key bin_directory buildout:bin-directory
key buildout_egg_directory buildout:eggs-directory
key buildout_develop_directory buildout:develop-eggs-directory
key buildout_directory buildout:directory
key template_monitor_cfg monitor2-template:rendered
key logrotate_cfg template-logrotate-base:rendered
key jscrawler_location jscrawler:location
key nodejs_location nodejs:location
key openssl_location openssl:location
key template_jscrawler template-jscrawler:target
key template_jscrawler_builder template-jscrawler-builder:target
[template-jscrawler]
<= download-template
output = ${buildout:directory}/instance-jscrawler.cfg.jinja2
[template-jscrawler-builder]
<= download-template
output = ${buildout:directory}/template-jscrawler.builder.sh.in
software/jscrawler/software.cfg.json
0 → 100644
View file @
19283f5c
{
"name"
:
"JSCrawler"
,
"description"
:
"JSCrawler"
,
"serialisation"
:
"xml"
,
"software-type"
:
{
"default"
:
{
"title"
:
"Default"
,
"description"
:
"JSCrawler"
,
"request"
:
"instance-jscrawler-input-schema.json"
,
"response"
:
"instance-jscrawler-output-schema.json"
,
"index"
:
0
}
}
}
software/jscrawler/template-jscrawler.builder.sh.in
0 → 100644
View file @
19283f5c
#!/bin/bash
# Cron-driven sitemap builder: crawls every configured URL and publishes one
# sitemap XML file per site.
# This file is a Jinja2 template: the double-brace placeholders are replaced
# with concrete values when the instance profile renders it.

# Whitespace/newline separated list of site URLs to crawl.
URLS="{{ url_list }}"
# Directory where finished sitemaps are published.
OUTPUT_DIR="{{ public_folder }}"
# Scratch directory the crawler writes into before a sitemap is published.
TMP_DIR="{{ tmp_folder }}"
# Number of days between two crawls of the same site; 0 disables re-crawling.
PERIOD="{{ period }}"
# File holding the pid of the run currently in progress.
PID_FILE="{{ pid_file }}"

# Single-instance guard. The pid file is only honoured while the process it
# names is still alive: a run that died without executing its EXIT trap
# (e.g. SIGKILL or a machine crash) would otherwise leave the file behind and
# make every later cron run exit immediately.
if [ -s "$PID_FILE" ]; then
  running_pid=$(cat "$PID_FILE")
  # Signal 0 delivers nothing; it only reports whether the pid can be signalled.
  if kill -0 "$running_pid" 2>/dev/null; then
    echo "Crawler process already running with pid $running_pid"
    exit 1
  fi
  echo "Ignoring stale pid file (process $running_pid is not running)"
fi
# Remove the pid file whenever this run ends, successfully or not.
trap 'rm -f -- "$PID_FILE"' EXIT
echo $$ > "$PID_FILE"
# crawl URL TMP_FILE SITEMAP
# Runs the crawler on URL with TMP_FILE as its output file, then moves
# TMP_FILE to SITEMAP if the crawler left a non-empty file there.
crawl() {
  # Every expansion is quoted so that a URL or path containing "?", "*", "["
  # or whitespace reaches the command verbatim instead of being globbed or
  # word-split by the shell. The output file is taken from the second
  # argument rather than from the caller's TMP_OUTPUT global.
  {{ jscrawler_wrapper }} -f "$2" --link "$1"
  # Publish only non-empty results, so an empty crawl output never replaces
  # a sitemap that is already being served.
  if [ -s "$2" ]; then
    mv "$2" "$3"
  fi
}
# check_crawl URL TMP_FILE SITEMAP
# Re-crawls URL when SITEMAP was last modified more than PERIOD days ago;
# otherwise prints that the URL is skipped. An empty or zero PERIOD disables
# re-crawling altogether.
check_crawl() {
  # Locals keep this function from overwriting the caller's variables
  # (the main loop uses "url" as its own loop variable).
  local url="$1" tmp="$2" sitemap="$3"
  local now modified next_crawl

  if [ -z "$PERIOD" ] || [ "$PERIOD" -eq 0 ]; then
    echo "Already crawled $url... SKIPPED"
    return 0
  fi

  now=$(date +"%s")
  # Last modification time of the published sitemap, in seconds since epoch.
  modified=$(stat -c %Y "$sitemap")
  # Earliest time at which the next crawl is due (86400 seconds per day).
  next_crawl=$((modified + 86400 * PERIOD))
  if [ "$now" -gt "$next_crawl" ]; then
    crawl "$url" "$tmp" "$sitemap"
  else
    echo "Already crawled $url... SKIPPED"
  fi
}
# Disable pathname expansion: URLs may legitimately contain "?", "*" or "["
# and must never be expanded against files of the current directory.
set -f

# URLS is deliberately left unquoted: word splitting is what turns the
# whitespace/newline separated list into one URL per iteration.
for url in $URLS
do
  # Name is the URL domain
  NAME=$(echo "$url" | cut -d'/' -f3)
  # Add the path if it exists in the URL (query string excluded, "/" -> "_")
  NAME="$NAME$(echo "$url" | grep -Po '\w\K/\w+[^?]+' | sed -r 's/\/+/_/g')"
  echo "Checking $NAME..."
  OUTPUT="$OUTPUT_DIR/$NAME.xml"
  TMP_OUTPUT="$TMP_DIR/$NAME.xml"
  if [ -s "$OUTPUT" ]; then
    # A sitemap is already published: only re-crawl once it is old enough.
    check_crawl "$url" "$TMP_OUTPUT" "$OUTPUT"
  else
    # No sitemap yet (or an empty one): crawl right away.
    crawl "$url" "$TMP_OUTPUT" "$OUTPUT"
  fi
done
software/jscrawler/test/README.md
0 → 100644
View file @
19283f5c
Tests for JsCrawler software release
software/jscrawler/test/setup.py
0 → 100644
View file @
19283f5c
##############################################################################
#
# Copyright (c) 2018 Nexedi SA and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from
setuptools
import
setup
,
find_packages
version = '0.0.1.dev0'
name = 'slapos.test.jscrawler'

# Read the long description through a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open("README.md") as readme:
    long_description = readme.read()

setup(
    name=name,
    version=version,
    description="Test for SlapOS' JSCrawler",
    long_description=long_description,
    long_description_content_type='text/markdown',
    maintainer="Nexedi",
    maintainer_email="info@nexedi.com",
    url="https://lab.nexedi.com/nexedi/slapos",
    packages=find_packages(),
    install_requires=[
        'slapos.core',
        'slapos.libnetworkcache',
        'erp5.util',
        'supervisor',
        'requests',
    ],
    zip_safe=True,
    test_suite='test',
)
software/jscrawler/test/test.py
0 → 100644
View file @
19283f5c
##############################################################################
#
# Copyright (c) 2019 Nexedi SA and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
import
os
import
logging
from
six.moves.urllib.parse
import
urlparse
import
requests
import
time
from
slapos.testing.testcase
import
makeModuleSetUpAndTestCaseClass
from
slapos.testing.utils
import
ManagedHTTPServer
# Derive the module-level setUpModule and the base test case class from the
# software release profile located at ../software.cfg relative to this file.
setUpModule, SlapOSInstanceTestCase = makeModuleSetUpAndTestCaseClass(
    os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', 'software.cfg')))
class TestJSCrawler(SlapOSInstanceTestCase):
    """Checks that the instance publishes a sitemap for each requested URL."""

    @classmethod
    def getInstanceParameterDict(cls):
        """Return instance parameters listing two managed test web sites.

        The URLs are joined with newlines, matching the format of the
        ``urls`` parameter in the input schema.
        """
        # NOTE(review): do_GET is defined directly on the ManagedHTTPServer
        # subclass; confirm against slapos.testing.utils that this is where
        # the request handler is looked up, otherwise it is never called.
        class TestServer(ManagedHTTPServer):
            def do_GET(self):
                # type: () -> None
                self.send_response(200)
                self.send_header("Content-Type", "application/html")
                self.end_headers()
                # Assuming wfile is the usual binary stream of http.server
                # handlers, it only accepts bytes on Python 3, so the body
                # is encoded explicitly (this is also valid on Python 2).
                body = '<title>Hello {}</title>'.format(self._name)
                self.wfile.write(body.encode('utf-8'))

        return {
            'urls': '\n'.join([
                cls.getManagedResource('website1', TestServer).url,
                cls.getManagedResource('website2', TestServer).url,
            ])
        }

    def setUp(self):
        """Remember the published frontend URL of the instance."""
        self.url = self.computer_partition.getConnectionParameterDict()['url']

    def test_http_get(self):
        """The published URL answers with 200 or 302."""
        resp = requests.get(self.url, verify=False)
        # assertIn reports the actual status code on failure, unlike
        # assertTrue(x in [...]) which only says "False is not true".
        self.assertIn(
            resp.status_code,
            [requests.codes.ok, requests.codes.found])

    def test_crawled_sitemap(self):
        """A <netloc>.xml sitemap is downloadable for every crawled site."""
        url_list = self.computer_partition.getInstanceParameterDict()[
            'urls'].split('\n')
        time.sleep(70)  # wait until cron runs
        for site_url in url_list:
            website = urlparse(site_url).netloc
            sitemap = requests.get(
                self.url + '/%s.xml' % website, verify=False)
            self.assertEqual(sitemap.status_code, requests.codes.ok)
software/slapos-sr-testing/software.cfg
View file @
19283f5c
...
...
@@ -162,6 +162,11 @@ setup = ${slapos-repository:location}/software/dream/test/
egg = slapos.test.repman
setup = ${slapos-repository:location}/software/repman/test/
[slapos.test.jscrawler-setup]
<= setup-develop-egg
egg = slapos.test.jscrawler
setup = ${slapos-repository:location}/software/jscrawler/test/
[slapos.core-repository]
<= git-clone-repository
repository = https://lab.nexedi.com/nexedi/slapos.core.git
...
...
@@ -215,6 +220,7 @@ extra-eggs =
${slapos.test.metabase-setup:egg}
${slapos.test.repman-setup:egg}
${slapos.test.caucase-setup:egg}
${slapos.test.jscrawler-setup:egg}
# We don't name this interpreter `python`, so that when we run slapos node
# software, installation scripts running `python` use a python without any
...
...
@@ -285,6 +291,7 @@ extra =
cloudooo ${slapos.test.cloudooo-setup:setup}
dream ${slapos.test.dream-setup:setup}
caucase ${slapos.test.caucase-setup:setup}
jscrawler ${slapos.test.jscrawler-setup:setup}
[versions]
# slapos.core is used from the clone always
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment