Commit 9238195c authored by Xiaowu Zhang

jscrawler: extend crawler to check site

parent 70db124b
...@@ -15,12 +15,16 @@ ...@@ -15,12 +15,16 @@
[instance] [instance]
filename = instance.cfg.in filename = instance.cfg.in
md5sum = 7333d1dfd4e8e4c375f7f1748292f554 md5sum = 5b5c740ed6f30e8a058b8e767840bf6f
[template-jscrawler] [template-jscrawler]
filename = instance-jscrawler.cfg.jinja2.in filename = instance-jscrawler.cfg.jinja2.in
md5sum = ae9269ab5b1cce77016f822024d2d996 md5sum = c955e6be5d5902bbaa0daedf09920ce7
[template-jscrawler]
filename = instance-jscrawler.cfg.jinja2.in
md5sum = c955e6be5d5902bbaa0daedf09920ce7
[template-jscrawler-builder] [template-jscrawler-builder]
filename = template-jscrawler.builder.sh.in filename = template-jscrawler.builder.sh.in
md5sum = c5e8f8b3983d5f572a564b34fa0f7499 md5sum = f8100800cda3532ee4cf987b34bd2c13
{% set part_list = [] -%}
{% macro section(name) %}{% do part_list.append(name) %}{{ name }}{% endmacro -%}
{% set site_list = slapparameter_dict.get('urls', "").split("\n") -%}
[directory] [directory]
recipe = slapos.cookbook:mkdirectory recipe = slapos.cookbook:mkdirectory
etc = ${buildout:directory}/etc etc = ${buildout:directory}/etc
...@@ -77,12 +81,15 @@ output = ${directory:bin}/jscrawler-build ...@@ -77,12 +81,15 @@ output = ${directory:bin}/jscrawler-build
extensions = jinja2.ext.do extensions = jinja2.ext.do
list = {{ slapparameter_dict.get('urls', "").split("\n") | join('\n ') }} list = {{ slapparameter_dict.get('urls', "").split("\n") | join('\n ') }}
period = {{ slapparameter_dict.get('crawl-periodicity', 0) }} period = {{ slapparameter_dict.get('crawl-periodicity', 0) }}
depth = {{ slapparameter_dict.get('crawl-depth', 3) }}
context = context =
key public_folder directory:www key public_folder directory:www
key tmp_folder directory:tmp key tmp_folder directory:tmp
key jscrawler_wrapper jscrawler-wrapper:wrapper-path key jscrawler_wrapper jscrawler-wrapper:wrapper-path
key period :period key period :period
key url_list :list key url_list :list
key depth :depth
raw shell_binary {{ bash_executable_location }} raw shell_binary {{ bash_executable_location }}
raw pid_file ${directory:run}/jscrawler.pid raw pid_file ${directory:run}/jscrawler.pid
...@@ -94,6 +101,16 @@ frequency = * * * * * ...@@ -94,6 +101,16 @@ frequency = * * * * *
command = ${jscrawler-build-wrapper:output} command = ${jscrawler-build-wrapper:output}
{% for site in site_list %}
{% set new_site = site.split('//')[1].strip() -%}
[{{ section('check-http-header-for-' + new_site) }}]
<= monitor-promise-base
promise = check_site_state
name = {{ 'check-site-state-for-' + new_site.replace('.', '-') + '.py'}}
config-site-state-file = ${directory:www}/{{ new_site + '.xml'}}
{% endfor %}
[publish-connection-information] [publish-connection-information]
<= monitor-publish <= monitor-publish
recipe = slapos.cookbook:publish recipe = slapos.cookbook:publish
...@@ -110,6 +127,7 @@ parts = ...@@ -110,6 +127,7 @@ parts =
httpd-wrapper httpd-wrapper
httpd-listen-promise httpd-listen-promise
jscrawler-frontend-promise jscrawler-frontend-promise
{{ part_list | join('\n ') }}
eggs-directory = {{ eggs_directory }} eggs-directory = {{ eggs_directory }}
develop-eggs-directory = {{ develop_eggs_directory }} develop-eggs-directory = {{ develop_eggs_directory }}
......
[buildout] [buildout]
develop = /srv/slapgrid/slappart78/srv/project/slapos.toolbox
parts = switch-softwaretype parts = switch-softwaretype
# std stuff for slapos instance # std stuff for slapos instance
......
...@@ -19,12 +19,15 @@ parts = ...@@ -19,12 +19,15 @@ parts =
[nodejs] [nodejs]
<= nodejs-12.18.3 <= nodejs-12.18.3
[jscrawler] [jscrawlerxxxx]
recipe = slapos.recipe.build:gitclone recipe = slapos.recipe.build:gitclone
repository = https://lab.nexedi.com/Mynij/mynij-crawler.git repository = https://lab.nexedi.com/Mynij/mynij-crawler.git
revision = ccbdfdc4712c008034b891f081be92b9342c48ac revision = ccbdfdc4712c008034b891f081be92b9342c48ac
git-executable = ${git:location}/bin/git git-executable = ${git:location}/bin/git
[jscrawler]
location = /srv/slapgrid/slappart78/srv/project/mynij-crawler
[jscrawler-build] [jscrawler-build]
recipe = plone.recipe.command recipe = plone.recipe.command
......
...@@ -5,6 +5,7 @@ URLS="{{ url_list }}" ...@@ -5,6 +5,7 @@ URLS="{{ url_list }}"
OUTPUT_DIR="{{ public_folder }}" OUTPUT_DIR="{{ public_folder }}"
TMP_DIR="{{ tmp_folder }}" TMP_DIR="{{ tmp_folder }}"
PERIOD="{{ period }}" PERIOD="{{ period }}"
DEPTH="{{ depth }}"
if [ -s "{{ pid_file}}" ]; then if [ -s "{{ pid_file}}" ]; then
echo "Crawler process already running with pid `cat {{ pid_file}}`" echo "Crawler process already running with pid `cat {{ pid_file}}`"
...@@ -16,7 +17,7 @@ trap "rm -f -- '{{ pid_file}}'" EXIT ...@@ -16,7 +17,7 @@ trap "rm -f -- '{{ pid_file}}'" EXIT
echo $$ > "{{ pid_file}}" echo $$ > "{{ pid_file}}"
crawl() { crawl() {
{{ jscrawler_wrapper }} -f $TMP_OUTPUT --link $1 {{ jscrawler_wrapper }} -f $TMP_OUTPUT --link $1 -d $4
if [ -s "$2" ]; then if [ -s "$2" ]; then
mv $2 $3 mv $2 $3
fi fi
...@@ -35,7 +36,7 @@ check_crawl() { ...@@ -35,7 +36,7 @@ check_crawl() {
I=$((T+86400*PERIOD)) I=$((T+86400*PERIOD))
diff=$((NOW-I)) diff=$((NOW-I))
if [ "$diff" -gt 0 ]; then if [ "$diff" -gt 0 ]; then
crawl $url $tmp $sitemap crawl $url $tmp $sitemap $4
else else
echo "Already crawled $url... SKIPPED" echo "Already crawled $url... SKIPPED"
fi fi
...@@ -52,8 +53,8 @@ do ...@@ -52,8 +53,8 @@ do
TMP_OUTPUT="$TMP_DIR/$NAME.xml" TMP_OUTPUT="$TMP_DIR/$NAME.xml"
if [ -s "$OUTPUT" ]; then if [ -s "$OUTPUT" ]; then
check_crawl $url $TMP_OUTPUT $OUTPUT check_crawl $url $TMP_OUTPUT $OUTPUT $DEPTH
else else
crawl $url $TMP_OUTPUT $OUTPUT crawl $url $TMP_OUTPUT $OUTPUT $DEPTH
fi fi
done done
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment