# cancionero-web/src/latex_scanner.py

import argparse
import os
import re

from django.conf import settings
from django.template import Engine, Context
from os.path import join
from pathlib import Path

from audio_scanner import find_audios
from model import Chord, Line, Song, Verse


def mkdir(path: str) -> None:
    '''Creates the directory at path, including any missing parent folders.

    Does nothing when the directory already exists. Letting os.makedirs do
    the existence check avoids the race between testing and creating.

    :param path: The directory to create.
    :raises FileExistsError: If path exists but is not a directory.
    '''
    os.makedirs(path, exist_ok=True)


# Note that re.match only matches at the start of the string (as if the
# pattern began with \A), whereas re.search looks for a match anywhere in it


def read_property(text: str, key: str) -> str | None:
    '''A parser for \\beginsong attributes.'''
    if text is None:
        return None
    match = re.search(key + "={(.*?)}", text)
    return match.group(1) if match else None


def extra_put(extra: list, index: int, the_type: str, data: dict|None = None):
    '''Adds the given data (if any) to the extra list, \
    at the given index, and tagged with the given the_type.'''
    payload = {'type': the_type, 'data': data} if data else {'type': the_type}
    if index not in extra:
         extra[index] = []
    extra[index].append(payload)


class SongLoader:
    '''Parses LaTeX files to build a collection of song objects.'''

    def __init__(self, latex_file: str, audio_dir: str | None = None):
        '''Initializes and populates a LaTeX reader.'''
        self.index: int            = 1
        self.category: str | None  = None
        self.categories: list[str] = []
        self.songs: list[Song]     = []
        if audio_dir:
            self.audio_dir = audio_dir
        self.scan(latex_file)

    def scan(self, latex_file: str) -> None:
        '''Reads through an index file and scans each song, with the same numbers.
        :param latex_file: The main latex file, which includes songs via \\input.'''
        main_file = open(latex_file, 'r')
        for line in main_file.readlines():
            # Remove newline
            if line[-1] == '\n':
                line = line[:-1]
            # Remove comments
            line = re.sub(r"%.*$", "", line)
            # Read counter and category change (max 1 per line)
            re_set_counter_match = re.search(r"\\setcounter{songnum}{(\d+)}", line)
            if re_set_counter_match is not None:
                self.index = int(re_set_counter_match.group(1))
            re_chapter_match = re.search(r"\\songchapter{(.*?)}", line)
            if re_chapter_match is not None:
                self.category = re_chapter_match.group(1)
                self.categories.append(self.category)
            # Traverse into \input commands if path starts w/ 'canciones/'
            re_input_match = re.search(r"\\input{(.*?)}", line)
            if re_input_match is not None:
                input_file = join(str(Path(latex_file).parent), re_input_match.group(1))
                if not input_file.endswith(".tex"):
                    input_file += ".tex"
                self.scan_song_file(input_file)

    def scan_others(self, folder: str, start_index: int) -> None:
        '''Looks for songs not found during self.scan(...).
        :param folder:      A folder to scan through, looking for .tex files.
        :param start_index: When numbering new songs, the first index to use.
        '''
        self.index = start_index
        self.category = "Nuevas"
        self.categories.append(self.category)
        files_scanned = [s.latex_file for s in self.songs]
        files_to_scan = [os.path.join(root, name) for root, dirs, files in os.walk(folder, topdown=False) for name in
                         files]
        files_to_scan = [f for f in files_to_scan if f.endswith(".tex") and f[f.index('/') + 1:] not in files_scanned]
        files_to_scan = sorted(files_to_scan)
        for f in files_to_scan:
            print("Scanning extra file", f)
            self.scan_song_file(f)

    def scan_song_file(self, song_file: str) -> None:
        '''Scan a single song file and store any songs found.'''
        # Variables
        ignore: bool                = False
        current_song: Song | None   = None
        current_verse: Verse | None = None
        memory: str | None          = None
        memorizing: bool            = False
        replay_index: int           = 0
        transpose: int              = 0
        trfmt: str                  = "normal"

        # General behaviour: read the file and scan line-by-line
        # In each line, read char-by-char, searching for common LaTeX commands
        # Apply the effects of these commands, and add each verse (line inside
        # song after removing all commands) to build up each song.
        # Commands include:
        ### Comments: % Something something -> Ignored
        ### Line break locations: \brk -> Ignored
        ### Transpose:        \transpose{SEMITONES}
        ### Begin song:       \beginsong{NAME}[METADATA]
        ### End song:         \endsong
        ### Verse begin/end:  \beginverse, \endverse
        ### Chorus begin/end: \beginchorus, \endchorus
        ### Capo:             \capo{FRET}
        ### Chord-excl. txt.: \ifchorded, \else, \fi
        ###    (the contents between \else and \fi are discarded)
        ### Echoes:           \echo{TEXT}
        ### Chord:            \[CHORD]
        ### Chord repetition: \^
        ### Music repetition: \lrep, \rrep
        ### Lyric repetition: \rep{TIMES}
        ### Chord memory:     \memorize, \replay
        ### Transpose format: \renewcommand{\trchordformat}[2]{.*}
        ### Other unrecognized commands: \NAME([ARG]|{ARG})*
        for line in open(song_file, "r").readlines():
            # Remove newline
            if line[-1] == '\n':
                line = line[:-1]
            # Remove comments and \brk commands
            text = re.sub(r"%.*$", "", line)
            text = re.sub(r"\\brk({})?", '', text)
            text = re.sub(r"``", u"\u201C", text)
            text = re.sub(r"''", u"\u201D", text)
            text = re.sub(r"`", u"\u2018", text)
            text = re.sub(r"'", u"\u2019", text)

            extras = {}
            i = 0
            while i <= len(text):
                beginning = text[:i]
                remain = text[i:]
                if re.match(r"\\fi", remain):
                    ignore = False
                    text = beginning + text[i + len("\\fi"):]
                    continue
                if ignore:
                    i += 1
                    continue

                # Command lookup
                if re_transpose_match := re.match(r"\\transpose *?{(-?\d+?)}", remain):
                    text = beginning + text[i + len(re_transpose_match.group(0)):]
                    transpose = int(re_transpose_match.group(1))
                    continue
                if re_song_begin_match := re.match(r"\\beginsong *?{(.*?)}(\[.*?])?", remain):
                    text = beginning + text[i + len(re_song_begin_match.group(0)):]
                    if current_song is not None:
                        print("error end-begin song! %s at %s" % (line, song_file))
                        self.songs.append(current_song)
                        self.index += 1
                    current_song = Song(re_song_begin_match.group(1), self.index,
                                        author=read_property(re_song_begin_match.group(2), "by"),
                                        origin=read_property(re_song_begin_match.group(2), "m"),
                                        category=self.category,
                                        latex_file=song_file[song_file.index('/') + 1:])
                    transpose = 0
                    trfmt = "normal"
                    memory = None
                    memorizing = False
                    replay_index = 0
                    if hasattr(self, "audio_dir"):
                        for a in find_audios(self.index, self.audio_dir):
                            current_song.add_audio(a)
                    continue
                if re.match(r"\\endsong", remain):
                    text = beginning + text[i + len("\\endsong"):]
                    self.songs.append(current_song)
                    current_song = None
                    self.index += 1
                    continue
                if re_verse_cmd_match := re.match(r"\\(begin|end)(verse|chorus)", remain):
                    text = beginning + text[i + len(re_verse_cmd_match.group(0)):]
                    is_chorus = re_verse_cmd_match.group(2) == "chorus"
                    if current_song is None:
                        print("verse %s found outside song in %s" % (line, song_file))
                    if re_verse_cmd_match.group(1) == "begin":
                        if current_verse is not None:
                            print("error end-begin verse! %s at %s" % (line, song_file))
                            current_song.add_verse(current_verse)
                        if not is_chorus and memory is None:
                            memory = []
                            memorizing = True
                        replay_index = 0
                        current_verse = Verse(is_chorus)
                    else:  # end of verse/chorus
                        if current_verse.is_chorus != is_chorus:
                            print("ended chorus-verse with wrong command?")
                        memorizing = False
                        current_song.add_verse(current_verse)
                        current_verse = None
                    continue
                if (re_capo_match := re.match(r"\\capo{(\d+?)}", remain)) and current_song:
                    text = beginning + text[i + len(re_capo_match.group(0)):]
                    current_song.set_capo(int(re_capo_match.group(1)))
                    continue
                if re.match(r"\\ifchorded", remain):
                    text = beginning + text[i + len("\\ifchorded"):]
                    continue
                if re.match(r"\\else", remain):
                    ignore = True
                    text = beginning + text[i + len("\\else"):]
                    continue
                if re_echo_match := re.match(r"\\echo[ \t]*?{((.|{.*?})*?)}", remain):
                    text = beginning + re_echo_match.group(1) + "\\echoend" + text[i + len(re_echo_match.group(0)):]
                    extra_put(extras, i, "echo")
                    continue
                if re.match(r"\\echoend", remain):
                    text = beginning + text[i + len("\\echoend"):]
                    extra_put(extras, i, "echo")
                    continue
                if re_chord_match := re.match(r"\\\[(.+?)]", remain):
                    text = beginning + text[i + len(re_chord_match.group(0)):]
                    c = Chord(re_chord_match.group(1), transpose, trfmt)
                    extra_put(extras, i, "chord", c)
                    if memorizing:
                        memory.append(c)
                    continue
                if re.match(r"\^", remain):
                    text = beginning + text[i + len("^"):]
                    if memory is not None and replay_index < len(memory):
                        extra_put(extras, i, "chord", memory[replay_index])
                        replay_index += 1
                    continue
                if re_dir_rep_match := re.match(r"\\([lr]rep)", remain):
                    text = beginning + text[i + len(re_dir_rep_match.group(0)):]
                    extra_put(extras, i, "dir-rep", re_dir_rep_match.group(1))
                    continue
                if re_rep_match := re.match(r"\\rep{(\d+?)}", remain):
                    text = beginning + text[i + len(re_rep_match.group(0)):]
                    extra_put(extras, i, 'rep', int(re_rep_match.group(1)))
                    continue
                if re.match(r"\\memorize", remain):
                    text = beginning + text[i + len("\\memorize"):]
                    memory = []
                    memorizing = True
                    continue
                if re.match(r"\\replay", remain):
                    text = beginning + text[i + len("\\replay"):]
                    replay_index = 0
                    continue
                # Double or single transpose mode
                if re_trfmt := re.match(r"\\renewcommand{\\trchordformat}\[2\]{\\vbox{\\hbox{#1}\\hbox{#2}}}", remain):
                    text = beginning + text[i + len(re_trfmt.group(0)):]
                    trfmt = "double"
                    continue
                if re_trfmt := re.match(r"\\renewcommand{\\trchordformat}\[2\]{\\vbox{\\hbox{#1}\\hbox{}}}", remain):
                    text = beginning + text[i + len(re_trfmt.group(0)):]
                    trfmt = "hover"
                    continue
                if re_trfmt := re.match(r"\\renewcommand{\\trchordformat}\[2\]{\\hbox{#2}}", remain):
                    text = beginning + text[i + len(re_trfmt.group(0)):]
                    trfmt = "normal"
                    continue
                # Command lookup end, removing any unrecognized command
                re_macro_match = re.match(r"\\([^ \t{\[]+)[ \t]*?({.*?}|\[.*?])*", remain)
                if re_macro_match:
                    text = beginning + text[i + len(re_macro_match.group(0)):]
                    print("Removed an unrecognized command:", re_macro_match.group(0))
                    continue
                i += 1
            if not current_verse and text.strip() != '':
                print("l outside v:", text)
                continue
            if ignore or text.strip() == '':
                continue
            current_verse.add_line(Line(text, extras))

    def sort_categories(self) -> dict[str, list[Song]]:
        '''Returns a dictionary of categories to lists of songs (sorted by number).'''
        result = {}
        for c in self.categories:
            result[c] = sorted([s for s in self.songs if s.category == c],
                               key=lambda s: s.number)
        return result

    def print_index(self, index_file, dj_engine):
        '''Renders the "index.html" template and writes it to index_file.

        :param index_file: Path of the file to write (it is overwritten).
        :param dj_engine:  The template engine used to load "index.html".
        '''
        # The template receives the songs grouped by category
        grouped = self.sort_categories()
        template = dj_engine.get_template("index.html")
        page = template.render(Context({'sorted_categories': grouped}))
        with open(index_file, 'w') as out:
            out.write(page)

    @staticmethod
    def print_song(song, directory, dj_engine):
        '''Writes the two pages of a song, each as an index.html in its own folder.

        The folder named after the zero-padded song number gets the
        "song_redir.html" template, and the folder named song.url() gets the
        "song.html" template.

        :param song:      The song to render.
        :param directory: The folder in which both subfolders are created.
        :param dj_engine: The template engine used to load the templates.
        '''
        context = Context({'song': song})

        def emit(subfolder, template_name):
            # Create the folder, then render the template into its index.html
            target = join(directory, subfolder)
            mkdir(target)
            with open(join(target, "index.html"), 'w') as out:
                out.write(dj_engine.get_template(template_name).render(context))

        emit("%03d" % song.number, "song_redir.html")
        emit(song.url(), "song.html")

    def generate_html(self, output_dir, dj_engine):
        '''Writes the pages of every song and then the index into output_dir.

        :param output_dir: The destination folder, created if missing.
        :param dj_engine:  The template engine passed on to the print methods.
        '''
        mkdir(output_dir)
        for entry in self.songs:
            self.print_song(entry, output_dir, dj_engine)
        index_path = join(output_dir, "index.html")
        self.print_index(index_path, dj_engine)


def create_argparser():
    '''Builds the command-line parser of this script.

    Every option is declared with nargs=1, so each parsed value (and each
    default) is a one-element list.
    '''
    parser = argparse.ArgumentParser()
    option = parser.add_argument
    option("--latex", nargs=1, required=True,
           help="The main LaTeX file. It may include other documents")
    option("--other-latex", nargs=1, required=False, default=[None],
           help="A folder with songs, those not referenced in the main file will be included at the end.")
    option("--other-index", nargs=1, required=False, default=[400], type=int,
           help="The first song number for songs outside the main songbook.")
    option("--audios", nargs=1, required=False, default=[None],
           help="The folder containing the audio files.")
    option("--output-dir", nargs=1, required=False, default=["public"])
    return parser


# Script entry point: parse the songbook and write the HTML site
if __name__ == '__main__':
    args = create_argparser().parse_args()
    loader = SongLoader(args.latex[0], args.audios[0])
    # The options use nargs=1, so a missing --other-latex yields the list
    # [None], which is truthy: test the value itself rather than the list
    if args.other_latex[0]:
        loader.scan_others(args.other_latex[0], int(args.other_index[0]))
    # Configure Django's settings before creating the template engine
    settings.configure(USE_TZ=False, USE_I18N=False)
    engine = Engine(dirs=["res/templates/"])
    loader.generate_html(args.output_dir[0], engine)