view ppgen.py @ 7:8b2f8f439817

Improves: ding parser. * Strips greater and lesser signs in the beginning and end of words when reading a ding directory. Words enclosed by those characters seem to be variants. This affects about 100 to 200 words for de in de-en 1.7.
author Bernhard Reiter <bernhard@intevation.de>
date Tue, 21 Feb 2017 14:14:08 +0100
parents 81f75c9aac84
children 200c2c3c5f67
line wrap: on
line source
#!/usr/bin/env python3
"""Create a random passphrase from a dictionary of words. BETA

Relies on the entropy of python's
    random.SystemRandom class
        which (according to the documentation) calls os.urandom()
        which (according to the documentation) calls the operating system
           specific randomness source which "should be unpredictable
            enough for cryptographic applications"

Requires:
  * Python v>=3.2
  * a dictionary, Ding's trans-de-en by default.
    E.g. on a Debian/Ubuntu system in package "trans-de-en".
    or from http://ftp.tu-chemnitz.de/pub/Local/urz/ding/de-en/

Uses a hardcoded filepath and language.
Search for **customize** below to change it.

Related: There is a Go implementation started by Sascha L. Teichmann at
  https://bitbucket.org/s_l_teichmann/ppgen


Copyright 2016, 2017 by Intevation GmbH.
Author: Bernhard E. Reiter <bernhard@intevation.de>

This file is Free Software under the Apache 2.0 license and thus
comes without any warranty (to the extent permissible under applicable law).
"""

import argparse
import math
import re
import sys

from random import SystemRandom
# Single shared random generator used for picking the passphrase words;
# SystemRandom is the randomness source discussed in the module docstring.
_srandom = SystemRandom()

# Module-wide warning flag: buildDictionary() sets it for a small dictionary,
# main() sets it when a word was drawn twice and finally reports it.
tainted = False   # to be set if we find a hint that the passphrase may be weak


def buildDictionary(options):
    """Build up a dictionary of unique words, calculate stats.

    The words are read with readDingDict() from a hardcoded Ding file
    (search for **customize** below to change it).  Statistics about the
    number of possible passphrases for 1 to 4 words are printed to stdout.

    options: object with a ``ddump_filename`` attribute; if that is set,
             all words are written to this file, one word per line.

    Sets the module-level flag ``tainted`` if fewer than 8000 words
    were found.

    Returns the list of words.

    Raises ValueError if no words were found at all, because neither the
    statistics nor a passphrase can be produced from an empty dictionary.
    """
    global tainted

    # dictionary for testing
    #d = ["abc", "aBc", "cde", "efg", "hij", "blubber",
    #      "jikf", "zug", "lmf", "opq"]
    # second test dictionary to show that different string functions are used.
    #d =  [''.join('A' * 1000) for _ in range(1000)]

    # Using the dictionary from Ding **customize**
    d = readDingDict(filename="/usr/share/trans/de-en", useLeft=True)

    ## for debugging purposes, dump dictionary
    if options.ddump_filename:
        print("Writing out dictionary in '{}'.".format(options.ddump_filename))
        with open(options.ddump_filename, "w") as f:
            f.writelines("{}\n".format(word) for word in d)

    # Print some stats on the dictionary to be used
    dl = len(d)
    print("Found {:d} dictionary entries.".format(dl))
    if dl < 8000:
        print("!Your dictionary is below 8k entries, that is quite small!")
        tainted = True

    if dl == 0:
        # log(0) is undefined: fail with a clear message instead of the
        # "math domain error" that math.log() would raise below.
        raise ValueError("The dictionary contains no usable words, "
                         "cannot generate a passphrase.")

    print("|= Number of words |= possibilities |")
    for i in range(1, 5):
        # dl**i combinations for i words, shown as a power of two
        print("|               {:2d} |    2^{:4.1f}      |".format(
              i, math.log(dl**i, 2)))
    return d


def readDingDict(filename="/usr/share/trans/de-en", useLeft=False,
                 encoding="utf-8"):
    """Read dictionary with unique words from file in Ding format.

    filename: path of the Ding dictionary file to read
    useLeft: Boolean to control which language to use: True takes the text
             left of the " :: " separator of each line, False the text
             right of it
    encoding: text encoding used to decode the file.  It is given
              explicitly so that the result does not depend on the locale
              of the calling process; the default assumes the Ding file is
              UTF-8 encoded.

    Lines starting with '#' are skipped.  The selected side of each line
    is split at ' | ', '; ', a '/' between non-space characters and at
    whitespace.  Surrounding punctuation (including '<' and '>') and
    trailing slashes are removed from each piece.  Pieces with fewer than
    three characters and pieces starting with '[', '{' or '/' (presumably
    annotations rather than words) are dropped.

    Prints a progress dot to stdout for every 10000 lines processed.

    Returns a list of the unique words, in no particular order.

    TODO: add option to use both languages for people that speak them both?
    """

    dset = set()  # using the datatype 'set' to avoid duplicates

    splitter = re.compile(r"""\ \|\  # first pattern  ' | '
                           |;\       # second pattern '; '
                           |(?<=\S)/(?=\S)  # 3.:    '/' surrounded by chars
                           |\s+      # by whitespace
                           """, re.VERBOSE)

    print("Reading entries from {}.".format(filename), end='')
    counter = 0  # for progress or stopping early
    with open(filename, "r", encoding=encoding) as f:
        for line in f:
            if line.startswith('#'):
                continue

            # languages are separated by " :: "
            left, _, right = line.partition(" :: ")
            languageEntry = left if useLeft else right

            for word in splitter.split(languageEntry):
                word = word.strip('(",.)\'!:;<>').rstrip('/')
                if len(word) > 2 and word[0] not in '[{/':
                    dset.add(word)

            #TODO: check for very common words and remove them?

            counter += 1
            ## stop early when debugging
            #if counter > 10: break
            if counter % 10000 == 0:
                print('.', end='')
                sys.stdout.flush()
        print()

    return list(dset)


def main():
    """Parse the command line, build the word list, print a passphrase.

    Four words are drawn from the dictionary with the shared system random
    generator and printed one per line.  If the same word (ignoring case)
    was drawn more than once, the module-level flag ``tainted`` is set.
    A final warning is printed whenever ``tainted`` is set.
    """
    global tainted

    summary = __doc__.splitlines()[0]
    arg_parser = argparse.ArgumentParser(description=summary)
    arg_parser.add_argument('--ddump-filename',
                            help='filename to dump the dictionary to')
    args = arg_parser.parse_args()

    wordlist = buildDictionary(args)

    howMany = 4

    heading = "\nGenerated passphrase with {} randomly selected words:\n"
    print(heading.format(howMany))
    print("    ", end='')

    # draw all words first, then print each one followed by the indentation
    # for the next line
    picked = [_srandom.choice(wordlist) for _ in range(howMany)]
    for entry in picked:
        print(entry, end='\n    ')
    print("\n")

    # simple check if our random source is okay: compare the lower cased
    # words, a repeated word shrinks the set
    distinct = {entry.lower() for entry in picked}
    if len(distinct) < howMany:
        print("! Your random generator is weak")
        print("! or you are being very lucky.")
        tainted = True

    if tainted:
        print("!!! Don't use the resulting passphrase !!!")

# Generate a passphrase only when run as a script, not when imported.
if __name__ == "__main__":
    main()
This site is hosted by Intevation GmbH (Datenschutzerklärung und Impressum | Privacy Policy and Imprint)