OrderBib

A script to sort the entries of my BibTeX file sci.bib in their natural order. Keys have the form namexyα, where name is the first author's last name, xy the last two digits of the year of publication, and α a letter (a, b, c, etc.) that lifts the degeneracy when the same author publishes more than one paper in a year. Entries are ordered by name, then by the full year of publication, then by letter, so that kavokin98a appears before kavokin21a because 1998 < 2021, even though 98 > 21.

Usage

python orderbib.py sci.bib > sorted.bib

On stderr, the total number of entries processed is given along with the numbers of valid and invalid entries (followed by the list of invalid keys, if any), e.g., when using it for the first time before switching to automatic insertions of doi2bib:

Total entries: 4405
Valid entries: 4405
Invalid entries: 0

Source

First version v°1.0 on 8 September (2025).

#  ___          _           ____  _ _     
# / _ \ _ __ __| | ___ _ __| __ )(_) |__  
#| | | | '__/ _` |/ _ \ '__|  _ \| | '_ \ 
#| |_| | | | (_| |  __/ |  | |_) | | |_) |
# \___/|_|  \__,_|\___|_|  |____/|_|_.__/ 
# F.P. Laussy http://laussy.org/wiki/orderbib
# v°1.0 - Mon Sep  8 05:26:26 PM CEST 2025
# This reorders my bib, according to my rules!

import re
import sys

def main():
    """Sort the entries of a BibTeX file and print the result to stdout.

    The file named by the single command-line argument is read in full.
    ``@string`` definitions are printed first, in their original order,
    followed by the valid entries sorted by (name, full year, letter).
    The name and letter come from the citation key (letters + two digits +
    one letter) and the full year comes from the entry's ``year`` field, so
    that e.g. ``kavokin98a`` (1998) precedes ``kavokin21a`` (2021).

    An entry is invalid when no numeric ``year`` field is found or when its
    key does not follow the expected format.  Invalid entries are NOT
    written to stdout; their keys are listed on stderr together with the
    total, valid and invalid counts.  Any text of the input that is matched
    neither as an ``@string`` nor as an entry is not written either.

    Exits with status 1, after printing a usage message to stderr, unless
    exactly one command-line argument is given.
    """
    if len(sys.argv) != 2:
        print("Usage: python orderbib.py <bibtex_file>", file=sys.stderr)
        sys.exit(1)

    with open(sys.argv[1], 'r') as f:
        content = f.read()

    flags = re.DOTALL | re.IGNORECASE

    # @string definitions are collected separately and kept in file order.
    strings = re.findall(r'(@string\s*\{.*?}\s*)', content, flags)

    # Every other @type{key, fields... } entry; an entry is taken to end at
    # the first closing brace that starts a line (after optional whitespace).
    entries = re.findall(
        r'(@(?!string\b)\w+\{([^,]+),\s*(.*?)\n\s*\}\s*)', content, flags
    )

    # \b keeps fields such as "origyear" from being mistaken for "year";
    # the value may be bare, brace-delimited or quote-delimited.
    year_re = re.compile(r'\byear\s*=\s*["{]?\s*(\d+)', re.IGNORECASE)
    # Key format: name (letters) + 2 digits + 1 letter.
    key_re = re.compile(r'([a-z]+)(\d{2})([a-z])$', re.IGNORECASE)

    valid = []
    invalid_keys = []
    for full_entry, key, fields in entries:
        year_match = year_re.search(fields)
        key_match = key_re.match(key.strip())
        if not (year_match and key_match):
            invalid_keys.append(key)
            continue

        # NOTE: the two digits of the key are not cross-checked against the
        # last two digits of the year field; the year field alone decides
        # the chronological order.
        name, _yy, letter = key_match.groups()
        valid.append((
            name.lower(),
            int(year_match.group(1)),
            letter.lower(),
            full_entry.strip(),
        ))

    # Sort on (name, full year, letter) only; the sort is stable, so entries
    # with identical sort keys keep their original relative order.
    valid.sort(key=lambda record: record[:3])

    # Statistics go to stderr so that stdout can be redirected to a .bib file.
    print(f"Total entries: {len(entries)}", file=sys.stderr)
    print(f"Valid entries: {len(valid)}", file=sys.stderr)
    print(f"Invalid entries: {len(invalid_keys)}", file=sys.stderr)
    if invalid_keys:
        print("Invalid keys:", file=sys.stderr)
        for k in invalid_keys:
            print(k, file=sys.stderr)

    # @string definitions, one per line, then a single empty line separating
    # them from the entries.
    if strings:
        print('\n'.join(s.strip() for s in strings))
        print()

    # Sorted entries, each followed by an empty line.
    for record in valid:
        print(record[3])
        print()

# Run the sorter only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()