summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Luke Shumaker <lukeshu@lukeshu.com> 2017-06-30 22:27:24 -0400
committer Luke Shumaker <lukeshu@lukeshu.com> 2017-06-30 22:27:24 -0400
commit 2631b600d153aeda1d4201164dafc023dfdceedb (patch)
tree eb3458af979ed98230e5d5ce40709202df14f2ed
parent 99011e7fcebeccc26a3da591e3445a93ffadad3c (diff)
download the actual data
-rw-r--r-- Makefile 28
-rwxr-xr-x bin/wayfore 4
2 files changed, 31 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index 262d3af..5cb9fdc 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,10 @@ SHELL=bash -o pipefail
PATH:=$(CURDIR)/bin:$(PATH)
export PATH
-all: dat/index.txt
+url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
+murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))
+
+all: download
fix:
grep -rl '<html><body><h1>503' dat | xargs rm -fv --
@@ -23,6 +26,29 @@ dat/each-cdx/%.txt:
cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@'
dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys)))
cat -- $(foreach c,$^,'$c') | sort > $@
+dat/index.mk: dat/index.txt
+ < $< sed 's,^,index+=web.archive.org/web/,;s, ,/,' > $@
+
+-include dat/index.mk
+
+dat/content-dir/%/index.wahtml:
+ @mkdir -p '$(@D)'
+ curl -s 'http://$(call murl2url,$*)' > $@
+dat/content-dir/%/index.html: dat/content-dir/%/index.wahtml
+ wayfore < $< > $@
+dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
+ < $< sed -n '/^<pre>$$/,/<\/pre>/p' | sed -e 1d -e 's,</pre>.*,,' > $@
+dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
+ < $< grep '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' > $@
+content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
+download: $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
+
+dat/content-file/%:
+ @mkdir -p '$(@D)'
+ curl -s 'http://$(call murl2url,$*)' > $@
+content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
+download: $(content-file)
+.PHONY: all fix download
.DELETE_ON_ERROR:
.SECONDARY:
diff --git a/bin/wayfore b/bin/wayfore
new file mode 100755
index 0000000..b0bde8a
--- /dev/null
+++ b/bin/wayfore
@@ -0,0 +1,4 @@
+#!/usr/bin/sed -zrf
+# The opposite of 'wayback'
+s/(<[hH][eE][aA][dD]>).*<!-- End Wayback Rewrite JS Include -->/\1/
+s/<!-- BEGIN WAYBACK TOOLBAR INSERT -->.*<!-- END WAYBACK TOOLBAR INSERT -->//