Commit e80853a2 by Marco Mariani

hadoop, moved to stack

1 parent b9f5d141
#!/bin/bash
# Download the Project Gutenberg plain-text books used by the gutenberg demo
# into var/gutenberg/raw-data (relative to the current directory).
# wget -c continues partially downloaded files, so the script can be re-run.
#
# NOTE: this file is rendered by slapos.recipe.template, so dollar-brace
# expansions are avoided on purpose; plain $NAME is used instead.
source environment.sh

DIR=var/gutenberg/raw-data
mkdir -p "$DIR"

for url in \
  http://www.gutenberg.org/cache/epub/103/pg103.txt \
  http://www.gutenberg.org/cache/epub/18857/pg18857.txt \
  http://www.gutenberg.org/cache/epub/2488/pg2488.txt \
  http://www.gutenberg.org/cache/epub/164/pg164.txt \
  http://www.gutenberg.org/cache/epub/1268/pg1268.txt \
  http://www.gutenberg.org/cache/epub/800/pg800.txt \
  http://www.gutenberg.org/cache/epub/4791/pg4791.txt \
  http://www.gutenberg.org/cache/epub/3526/pg3526.txt \
  http://www.gutenberg.org/cache/epub/2083/pg2083.txt
do
  wget -P "$DIR" -c "$url"
done
......@@ -5,12 +5,12 @@
source environment.sh
hdfs dfs -mkdir gutenberg
hdfs dfs -mkdir var/gutenberg/input
RAW_DATA=${buildout:directory}/software_release/gutenberg
RAW_DATA=var/gutenberg/raw-data
for file in `ls $RAW_DATA`; do
hdfs dfs -put $RAW_DATA/$file gutenberg/
hdfs dfs -put $RAW_DATA/$file var/gutenberg/input
done
......
#!/bin/bash
# Run the gutenberg Hadoop Streaming job with the demo mapper and reducer.
# Based on:
# http://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/
#
# Input:  var/gutenberg/input/*  (populated by put-files.sh via hdfs dfs -put)
# Output: var/gutenberg/output
. environment.sh

# The jar glob stays unquoted on purpose: the local shell has to resolve it
# to the streaming jar file.
# The -input glob is quoted so the local shell never expands it against the
# local filesystem; the pattern is handed to Hadoop unchanged.
hadoop jar software_release/parts/hadoop-streaming/*jar \
  -mapper demo/gutenberg/mapper.py \
  -reducer demo/gutenberg/reducer.py \
  -input 'var/gutenberg/input/*' \
  -output var/gutenberg/output
# Instance profile template for the Hadoop demo.
# It extends the instance profile produced by the instance-stack part and
# adds the parts that install the gutenberg and wikipedia demo scripts.
[buildout]
extends =
${instance-stack:output}
# sh-environment, start-daemons and deploy-tar are not defined in this file;
# the remaining parts are the template sections declared below.
parts =
sh-environment
start-daemons
deploy-tar
gutenberg-data-download
gutenberg-mapper
gutenberg-reducer
gutenberg-run
gutenberg-put-files
wikipedia-data-download
wikipedia-mapper
wikipedia-reducer
wikipedia-run
wikipedia-put-files
# Destination directories for the rendered demo scripts, one per dataset.
# NOTE(review): the doubled dollar sign presumably escapes the reference so
# that it is resolved by the instance buildout rather than while this
# template is rendered - confirm against the slapos.recipe.template docs.
[directories]
demo = $${buildout:directory}/demo
gutenberg = $${:demo}/gutenberg
wikipedia = $${:demo}/wikipedia
# Gutenberg demo: every part below renders one file from the profile's
# gutenberg/ directory into the gutenberg demo directory as an executable
# (mode 0755). The md5sum options are commented out, so the templates are
# not checksum-pinned.

# Script that downloads the raw Gutenberg texts.
[gutenberg-data-download]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/gutenberg/data-download.sh
#md5sum =
output = $${directories:gutenberg}/data-download.sh
mode = 0755
# Mapper script passed to the streaming job by run.sh.
[gutenberg-mapper]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/gutenberg/mapper.py
#md5sum =
output = $${directories:gutenberg}/mapper.py
mode = 0755
# Reducer script passed to the streaming job by run.sh.
[gutenberg-reducer]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/gutenberg/reducer.py
#md5sum =
output = $${directories:gutenberg}/reducer.py
mode = 0755
# Script that launches the Hadoop Streaming job.
[gutenberg-run]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/gutenberg/run.sh.in
#md5sum =
output = $${directories:gutenberg}/run.sh
mode = 0755
# Script that uploads the downloaded files with hdfs dfs -put.
[gutenberg-put-files]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/gutenberg/put-files.sh.in
#md5sum =
output = $${directories:gutenberg}/put-files.sh
mode = 0755
# Wikipedia demo: every part below renders one file from the profile's
# wikipedia/ directory into the wikipedia demo directory as an executable
# (mode 0755). The md5sum options are commented out, so the templates are
# not checksum-pinned.

# Script that downloads the raw Wikipedia dump files.
[wikipedia-data-download]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/wikipedia/data-download.sh
#md5sum =
output = $${directories:wikipedia}/data-download.sh
mode = 0755
# Mapper script passed to the streaming job by run.sh.
[wikipedia-mapper]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/wikipedia/mapper.py
#md5sum =
output = $${directories:wikipedia}/mapper.py
mode = 0755
# Reducer script passed to the streaming job by run.sh.
[wikipedia-reducer]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/wikipedia/reducer.py
#md5sum =
output = $${directories:wikipedia}/reducer.py
mode = 0755
# Script that launches the Hadoop Streaming job.
[wikipedia-run]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/wikipedia/run.sh.in
#md5sum =
output = $${directories:wikipedia}/run.sh
mode = 0755
# Script that uploads the downloaded files with hdfs dfs -put.
[wikipedia-put-files]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/wikipedia/put-files.sh.in
#md5sum =
output = $${directories:wikipedia}/put-files.sh
mode = 0755
# Software profile of the Hadoop demo. The Hadoop-specific parts come from
# the shared stack profile; this file only adds the instance template.
[buildout]
extends =
../../stack/hadoop/buildout.cfg
parts =
slapos-cookbook
eggs
java
hadoop
hadoop-streaming
instance-stack
instance
# Renders instance.cfg.in from this profile's directory into instance.cfg
# inside the buildout directory. No md5sum is pinned.
[instance]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/instance.cfg.in
output = ${buildout:directory}/instance.cfg
# md5sum =
mode = 0644
#!/bin/bash
# Download a subset of the English Wikipedia dump into
# var/wikipedia/raw-data (relative to the current directory).
# wget -c continues partially downloaded files, so the script can be re-run.
#
# NOTE: this file is rendered by slapos.recipe.template, so dollar-brace
# expansions are avoided on purpose; plain $NAME is used instead.
source environment.sh

DIR=var/wikipedia/raw-data
mkdir -p "$DIR"

# http://dumps.wikimedia.org/enwiki/20140203/
# All pages, current versions only.
for url in \
  http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current1.xml-p000000010p000010000.bz2 \
  http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current2.xml-p000010001p000025000.bz2 \
  http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current3.xml-p000025001p000055000.bz2 \
  http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current4.xml-p000055002p000104998.bz2 \
  http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current5.xml-p000105001p000184999.bz2 \
  http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current6.xml-p000185003p000305000.bz2 \
  http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current7.xml-p000305002p000464997.bz2 \
  http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current8.xml-p000465001p000665000.bz2 \
  http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current9.xml-p000665001p000925000.bz2
do
  wget -P "$DIR" -c "$url"
done

# don't download the full dataset
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current10.xml-p000925001p001325000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current11.xml-p001325001p001825000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current12.xml-p001825001p002425000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current13.xml-p002425001p003124998.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current14.xml-p003125001p003924999.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current15.xml-p003925001p004825000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current16.xml-p004825002p006025000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current17.xml-p006025001p007524997.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current18.xml-p007525002p009225000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current19.xml-p009225001p011125000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current20.xml-p011125001p013324998.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current21.xml-p013325001p015725000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current22.xml-p015725003p018225000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current23.xml-p018225001p020925000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current24.xml-p020925002p023725000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current25.xml-p023725001p026624999.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current26.xml-p026625002p029625000.bz2
# wget -P $DIR -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current27.xml-p029625001p041836446.bz2
#!/usr/bin/env python
import bz2
import os
......@@ -39,23 +40,7 @@ def process_xml(input):
parser.parse(input)
if __name__ == '__main__':
input = bz2.BZ2File('/dev/fd/0')
process_xml(input)
# dirname = '/srv/slapgrid/slappart20/srv/runner/instance/slappart0/software_release/raw-data/'
# filenames = os.listdir(dirname)
# # ['enwiki-20140203-pages-meta-current1.xml-p000000010p000010000.bz2']
# for fname in filenames:
# process_xml(os.path.join(dirname, fname))
# input = bz2.BZ2File(process_xml(os.path.join(dirname, fname)))
#!/bin/bash
# Upload every file from the local var/wikipedia/raw-data directory into
# the var/wikipedia/input directory on HDFS.
#
# NOTE: this file is rendered by slapos.recipe.template, so dollar-brace
# expansions are avoided on purpose; plain $NAME is used instead.

# exit on error
# set -e

source environment.sh

# -p creates the missing parent directories of the nested path and does not
# fail when the directory already exists, so the script can be re-run.
hdfs dfs -mkdir -p var/wikipedia/input

RAW_DATA=var/wikipedia/raw-data

# Iterate with a glob instead of parsing ls output, so file names containing
# whitespace or glob characters are passed through intact.
for file in "$RAW_DATA"/*; do
  # An unmatched glob stays literal; skip it when the directory is empty.
  [ -e "$file" ] || continue
  hdfs dfs -put "$file" var/wikipedia/input
done
#!/bin/bash
# Run the wikipedia Hadoop Streaming job with the demo mapper and reducer.
#
# Input:  var/wikipedia/input/*  (populated by put-files.sh via hdfs dfs -put)
# Output: var/wikipedia/output
. environment.sh

# The jar glob stays unquoted on purpose: the local shell has to resolve it
# to the streaming jar file.
# The -input glob is quoted so the local shell never expands it against the
# local filesystem; the pattern is handed to Hadoop unchanged.
hadoop jar software_release/parts/hadoop-streaming/*jar \
  -mapper demo/wikipedia/mapper.py \
  -reducer demo/wikipedia/reducer.py \
  -input 'var/wikipedia/input/*' \
  -output var/wikipedia/output
# Software profile: builds on the generic slapos stack and the java
# component, and installs Hadoop, the streaming jar, the Gutenberg dataset
# and the instance template.
[buildout]
extends =
../../stack/slapos.cfg
../../component/java/buildout.cfg
parts =
slapos-cookbook
eggs
java
hadoop
hadoop-streaming
gutenberg-dataset
instance
# Python eggs (recipes) installed for use by this profile.
[eggs]
recipe = zc.recipe.egg
eggs =
slapos.cookbook
collective.recipe.template
cp.recipe.cmd
plone.recipe.command
# Hadoop 2.2.0 release tarball, verified against the md5sum below.
# download-only is set, so the archive is presumably kept as a file rather
# than unpacked here - confirm against the hexagonit.recipe.download docs.
[hadoop]
recipe = hexagonit.recipe.download
filename = hadoop-2.2.0.tar.gz
url = http://apache.mirrors.spacedump.net/hadoop/common/stable/${:filename}
md5sum = 25f27eb0b5617e47c032319c0bfd9962
download-only = true
mode = 0644
strip-top-level-dir = true
# Hadoop Streaming jar fetched from Maven Central; no md5sum is pinned.
# NOTE(review): the jar version is 0.20.203.0 while the Hadoop tarball above
# is 2.2.0 - confirm that this mismatch is intended.
[hadoop-streaming]
recipe = hexagonit.recipe.download
url = http://repo1.maven.org/maven2/org/apache/hadoop/hadoop-streaming/0.20.203.0/hadoop-streaming-0.20.203.0.jar
download-only = true
#md5sum =
mode = 0644
# Renders instance.cfg.in from this profile's directory into instance.cfg
# inside the buildout directory. No md5sum is pinned.
[instance]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/instance.cfg.in
output = ${buildout:directory}/instance.cfg
# md5sum =
mode = 0644
# Downloads the Project Gutenberg texts into the gutenberg directory under
# the buildout directory when the part is installed. update_cmd is
# /bin/true, so updating the part does no work. wget -c continues partially
# downloaded files.
[gutenberg-dataset]
recipe = cp.recipe.cmd
update_cmd = /bin/true
install_cmd =
mkdir -p ${buildout:directory}/gutenberg
cd ${buildout:directory}/gutenberg
wget -c http://www.gutenberg.org/cache/epub/103/pg103.txt
wget -c http://www.gutenberg.org/cache/epub/18857/pg18857.txt
wget -c http://www.gutenberg.org/cache/epub/2488/pg2488.txt
wget -c http://www.gutenberg.org/cache/epub/164/pg164.txt
wget -c http://www.gutenberg.org/cache/epub/1268/pg1268.txt
wget -c http://www.gutenberg.org/cache/epub/800/pg800.txt
wget -c http://www.gutenberg.org/cache/epub/4791/pg4791.txt
wget -c http://www.gutenberg.org/cache/epub/3526/pg3526.txt
wget -c http://www.gutenberg.org/cache/epub/2083/pg2083.txt
#[wikipedia-dataset]
#recipe = cp.recipe.cmd
#update_cmd = /bin/true
##update_cmd = ${:install_cmd}
#install_cmd =
# mkdir -p ${buildout:directory}/raw-data
# cd ${buildout:directory}/raw-data
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current1.xml-p000000010p000010000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current2.xml-p000010001p000025000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current3.xml-p000025001p000055000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current4.xml-p000055002p000104998.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current5.xml-p000105001p000184999.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current6.xml-p000185003p000305000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current7.xml-p000305002p000464997.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current8.xml-p000465001p000665000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current9.xml-p000665001p000925000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current10.xml-p000925001p001325000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current11.xml-p001325001p001825000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current12.xml-p001825001p002425000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current13.xml-p002425001p003124998.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current14.xml-p003125001p003924999.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current15.xml-p003925001p004825000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current16.xml-p004825002p006025000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current17.xml-p006025001p007524997.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current18.xml-p007525002p009225000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current19.xml-p009225001p011125000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current20.xml-p011125001p013324998.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current21.xml-p013325001p015725000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current22.xml-p015725003p018225000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current23.xml-p018225001p020925000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current24.xml-p020925002p023725000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current25.xml-p023725001p026624999.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current26.xml-p026625002p029625000.bz2
# wget -c http://dumps.wikimedia.org/enwiki/20140203/enwiki-20140203-pages-meta-current27.xml-p029625001p041836446.bz2
#!/bin/bash
# Run the gutenberg Hadoop Streaming demo job.
#
# Input:  gutenberg/*
# Output: gutenberg-output
. environment.sh

# The jar glob stays unquoted on purpose: the local shell has to resolve it
# to the streaming jar file.
# The -input glob is quoted so the local shell never expands it against the
# local filesystem; the pattern is handed to Hadoop unchanged.
hadoop jar software_release/parts/hadoop-streaming/*jar \
  -mapper gutenberg-mapper.py \
  -reducer gutenberg-reducer.py \
  -input 'gutenberg/*' \
  -output gutenberg-output
# Minimal profile: extends the java component and the slapos stack and
# installs only the java part.
[buildout]
extends =
../component/java/buildout.cfg
../stack/slapos.cfg
parts =
java
# Hadoop stack profile: builds on the generic slapos stack and the java
# component, and installs Hadoop, the streaming jar and the stack's
# instance template (instance-stack).
[buildout]
extends =
../../stack/slapos.cfg
../../component/java/buildout.cfg
parts =
slapos-cookbook
eggs
java
hadoop
hadoop-streaming
instance-stack
# Python eggs (recipes) installed for use by this profile.
[eggs]
recipe = zc.recipe.egg
eggs =
slapos.cookbook
collective.recipe.template
cp.recipe.cmd
plone.recipe.command
# Hadoop 2.2.0 release tarball, verified against the md5sum below.
# download-only is set, so the archive is presumably kept as a file rather
# than unpacked here - confirm against the hexagonit.recipe.download docs.
[hadoop]
recipe = hexagonit.recipe.download
filename = hadoop-2.2.0.tar.gz
url = http://apache.mirrors.spacedump.net/hadoop/common/stable/${:filename}
md5sum = 25f27eb0b5617e47c032319c0bfd9962
download-only = true
mode = 0644
strip-top-level-dir = true
# Hadoop Streaming jar fetched from Maven Central; no md5sum is pinned.
# NOTE(review): the jar version is 0.20.203.0 while the Hadoop tarball above
# is 2.2.0 - confirm that this mismatch is intended.
[hadoop-streaming]
recipe = hexagonit.recipe.download
url = http://repo1.maven.org/maven2/org/apache/hadoop/hadoop-streaming/0.20.203.0/hadoop-streaming-0.20.203.0.jar
download-only = true
#md5sum =
mode = 0644
# Renders instance-stack.cfg.in from this profile's directory into
# instance-stack.cfg inside the buildout directory. No md5sum is pinned.
[instance-stack]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/instance-stack.cfg.in
output = ${buildout:directory}/instance-stack.cfg
# md5sum =
mode = 0644
......@@ -2,10 +2,6 @@
parts =
sh-environment
put-files
mapper
reducer
run-demo
start-daemons
deploy-tar
......@@ -36,48 +32,16 @@ command =
# Unpack the Hadoop tarball only when the bin directory is not there yet.
[ -d $${directories:hadoop-prefix}/bin ] || tar xf ${hadoop:location}/${hadoop:filename} -C $${directories:hadoop-prefix} --strip-components=1
# Directories created for the instance by the mkdirectory recipe. All of
# them live under the buildout directory; services and promises are nested
# inside etc.
[directories]
recipe = slapos.cookbook:mkdirectory
bin = $${buildout:directory}/bin
etc = $${buildout:directory}/etc
var = $${buildout:directory}/var
hadoop-prefix = $${buildout:directory}/hadoop
services = $${directories:etc}/service
promises = $${directories:etc}/promise
# Each part below renders one template from the profile's template/bin
# directory into the instance bin directory as an executable (mode 0755).
# No md5sum is pinned for any of them.

# Script that uploads the input files.
[put-files]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/template/bin/put-files.sh.in
output = $${directories:bin}/put-files.sh
# md5sum =
mode = 0755
# Mapper and reducer follow the tutorial linked below.
# http://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/
[mapper]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/template/bin/gutenberg-mapper.py.in
output = $${directories:bin}/gutenberg-mapper.py
# md5sum =
mode = 0755
[reducer]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/template/bin/gutenberg-reducer.py.in
output = $${directories:bin}/gutenberg-reducer.py
# md5sum =
mode = 0755
# Script that launches the demo job.
[run-demo]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/template/bin/run-demo.sh.in
output = $${directories:bin}/run-demo.sh
# md5sum =
mode = 0755
[start-daemons]
recipe = slapos.recipe.template
url = ${:_profile_base_location_}/template/bin/start-daemons.sh.in
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!