From 2631b600d153aeda1d4201164dafc023dfdceedb Mon Sep 17 00:00:00 2001
From: Luke Shumaker
Date: Fri, 30 Jun 2017 22:27:24 -0400
Subject: download the actual data

---
NOTE(review): this patch text was recovered from a rendered web view in which
line breaks and leading TABs were collapsed and anything shaped like an HTML
tag or comment was dropped.  The layout below is rebuilt from the hunk headers
and the diffstat (line counts match).  The following literals are
reconstructions, not recovered text; TODO confirm each one against commit
2631b600 before applying:
  * Makefile, "fix" recipe: the tag in front of "503" in the grep pattern.
  * Makefile, readme.txt recipe: the "<pre>" in the sed line address and the
    "</pre>" in the second sed expression.
  * Makefile, metadata.txt recipe: everything between "grep '^" and "]*>//g'".
  * bin/wayfore: the HTML comment markers in both s/// commands.
The author's e-mail address was also lost from the From: header; "git am" may
reject the patch until it is restored, "git apply" does not need it.

What the change does: "all" now depends on "download".  dat/index.mk is
generated from dat/index.txt and included, defining $(index).  Entries ending
in "/" are fetched as directory listings (index.wahtml -> index.html ->
readme.txt + metadata.txt); every other entry is fetched as a plain file.
url2murl/murl2url escape and unescape "^", ":" and "%" so that a URL can be
used as a make target name.

 Makefile    | 28 +++++++++++++++++++++++++++-
 bin/wayfore |  4 ++++
 2 files changed, 31 insertions(+), 1 deletion(-)
 create mode 100755 bin/wayfore

diff --git a/Makefile b/Makefile
index 262d3af..5cb9fdc 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,10 @@ SHELL=bash -o pipefail
 PATH:=$(CURDIR)/bin:$(PATH)
 export PATH
 
-all: dat/index.txt
+url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
+murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))
+
+all: download
 
 fix:
 	grep -rl '<h1>503' dat | xargs rm -fv --
@@ -23,6 +26,29 @@ dat/each-cdx/%.txt:
 	cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@'
 dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys)))
 	cat -- $(foreach c,$^,'$c') | sort > $@
+dat/index.mk: dat/index.txt
+	< $< sed 's,^,index+=web.archive.org/web/,;s, ,/,' > $@
+
+-include dat/index.mk
+
+dat/content-dir/%/index.wahtml:
+	@mkdir -p '$(@D)'
+	curl -s 'http://$(call murl2url,$*)' > $@
+dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml
+	wayfore < $< > $@
+dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
+	< $< sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > $@
+dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
+	< $< grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' > $@
+content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
+download: $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
+
+dat/content-file/%:
+	@mkdir -p '$(@D)'
+	curl -s 'http://$(call murl2url,$*)' > $@
+content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
+download: $(content-file)
+.PHONY: all fix download
 
 .DELETE_ON_ERROR:
 .SECONDARY:
diff --git a/bin/wayfore b/bin/wayfore
new file mode 100755
index 0000000..b0bde8a
--- /dev/null
+++ b/bin/wayfore
@@ -0,0 +1,4 @@
+#!/usr/bin/sed -zrf
+# The opposite of 'wayback'
+s/(<[hH][eE][aA][dD]>).*<!-- End Wayback Rewrite JS Include -->/\1/
+s/<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->//
-- 
cgit v1.1-4-g5e80