# Copyright (c) 2017, 2023 Luke Shumaker
#
# This work is free. You can redistribute it and/or modify it under
# the terms of the Do What The Fuck You Want To Public License,
# Version 2, as published by Sam Hocevar. See the COPYING file for
# more details.

SHELL=bash -o pipefail
PATH:=$(CURDIR)/bin:$(PATH)
export PATH

url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))

dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; )

# This is split into stages for when Make has to make decisions about
# the build tree based on the output of a previous stage.  That is:
# these stages exist for a technical GNU Make reason, not for
# human-comprehensibility reasons; so stages have lopsided sizes: the
# first two are very small, and almost everything is in the third
# stage.

all:
# Stage 1 ######################################################################
	$(MAKE) dat/urlkeys.mk
# Stage 2 ######################################################################
	$(MAKE) dat/index.mk
# Stage 3 ######################################################################
	$(MAKE) dat/git
.PHONY: all

COPYING:
	curl -L http://www.wtfpl.net/txt/copying/ >$@

# Stage 1 ######################################################################
#
# Fetch a listing of all relevant URLs.
#
# - `dat/cdxindex.txt`
# - `dat/urlkeys.txt`
# - `dat/urlkeys.mk`

dat:
	mkdir -p $@

dat/cdxindex.txt: | dat
	cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'collapse=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' | grep -vFx 'org,unicode)/public/2.0-update/cvtutf7.c' > $@

dat/urlkeys.txt: dat/cdxindex.txt
	< $< cut -d '?' -f1 | sort -u > $@

dat/urlkeys.mk: dat/urlkeys.txt
	< $< sed 's/^/urlkeys+=/' > $@

# Stage 2 ######################################################################
#
# Fetch the history for each relevant URL.
#
# - `dat/each-cdx/$(urlkey).txt` (for each urlkey in `dat/urlkeys.mk`)
#
# - `dat/index.txt`
#   has a line for each relevant URL:
#
#       ${wayback_timestamp:YYYYmmddHHMMSS} ${url}
#
# - `dat/index.mk`

ifneq ($(wildcard dat/urlkeys.mk),)
include dat/urlkeys.mk

dat/each-cdx/%.txt:
	@mkdir -p '$(@D)'
	cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'fl=timestamp,original' > '$@'

dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) dat/urlkeys.txt
	cat -- $(foreach c,$(filter dat/each-cdx/%,$^),'$c') | sort > $@

dat/index.mk: dat/index.txt
	< $< sed -e 's,^,index+=,' -e 's, ,/,' > $@

# Stage 3 ######################################################################
#
# The main stage.
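# Everything below is wrapped in an ifneq/-include guard, so the
# Stage 3 rules only come into existence once `dat/index.mk` has been
# generated by Stage 2.  On a fresh tree the recursive $(MAKE) calls
# in `all` provide that ordering; running the stages by hand would be
# roughly equivalent (a sketch of the same sequence, not a separate
# supported interface):
#
#     make dat/urlkeys.mk && make dat/index.mk && make dat/git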
ifneq ($(wildcard dat/index.mk),)
-include dat/index.mk

# Part 1: Directory indexes:
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/index.html`
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/readme.txt`
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/metadata.txt`
#   has a line for each file mentioned in index.html (this format is
#   controlled by `bin/fmt-metadata`):
#
#       ${file_name} ${file_timestamp:YYYY-mm-dd HH:MM}

dat/content-dir/%/index.html:
	@mkdir -p '$(@D)'
	curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@

dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
	< $< sed -n '/^<[pP][rR][eE]>$$/,/<\/[pP][rR][eE]>/p' | sed -e 1d -e 's,</[pP][rR][eE]>.*,,' > $@

dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
	< $< grep -i '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@

content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))

# Part 2: File contents:
#
# - `dat/content-file/$(wayback_timestamp:YYYYmmddHHMMSS)/$(file_murl)`

dat/content-file/%:
	@mkdir -p '$(@D)'
	curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@

content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
download += $(content-file)

# `download` is a convenience target to download files without
# processing them.  It isn't depended on by anything.
download: $(download)
.PHONY: download

# Part 3: Aggregate:
#
# - `dat/metadata.txt`
#   has a line for each file mentioned in any index.html:
#
#       ${dirindex_wayback_timestamp:YYYYmmddHHMMSS} ${branch_name}/${file_name} ${file_html_timestamp:YYYY-mm-dd HH:MM}
#
#   where the ${dirindex_wayback_timestamp} and ${branch_name} are
#   determined from the path to the relevant index.html.
#
# - `dat/pools/`
#   + pass 1 and pass 1.5:
#     * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/`
#     * `dat/pools/snaps/${dirindex_wayback_timestamp:YYYYmmddHHMMSS}-${branch_name}/${file_name}` (symlink to the /files/ file)
#   + pass 2 and pass 3:
#     * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/${file_name}` (for each existing /files/ dir)

dat/metadata.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt
	grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -E -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@

dat/pools: $(download) dat/metadata.txt dat/index.txt
	rm -rf -- $@ $@.bak
	poolify dat/metadata.txt dat/index.txt || $(dirfail)

# Part 4: Turn each `dat/pools/snaps/*` directory into a Git commit.
#
# - `dat/git/`

dat/git: dat/pools $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
	rm -rf -- $@ $@.bak
	gitify $@ || $(dirfail)

################################################################################

endif
endif

.DELETE_ON_ERROR:
.SECONDARY:
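# Example usage (a sketch: it assumes the helper scripts in bin/,
# i.e. cdxget, urlkey2url, fmt-metadata, poolify, and gitify, are
# present; the PATH override at the top of this file makes them
# visible to the recipes):
#
#     make            # run all three stages, ending with dat/git
#     make download   # only fetch the raw Wayback Machine captures
#     git -C dat/git log    # browse the result, assuming gitify
#                           # leaves an ordinary Git repository there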