#!/usr/bin/env python3 # -*- coding: utf-8 -*- """Markov's own password generator, approved by Shannon. Passwords are generated based on a markovian model of a word corpus. If the corpus contains prononceable words, generated words will probably be prononceable too. The password will be sequence of words generated by the model joined by a configurable separator. One special character is added after each word. Words will be added until the target entropy is reached. Models are stored in ~/.local/share/andrei. Usage: andrei --help andrei [--clip] [--entropy N] [--model NAME] [--min-word-len X] [--max-word-len Y] [--specials SPECIALS] [--sep SEP] andrei modelize [--filter=REGEX] STATE_SIZE NAME FILE... Options: -h, --help show this help message -c, --clip copy the password to the clipboard instead of printing it -e N, --entropy=N minimum entropy for the password [default: 50] -m NAME, --model=NAME model used for generation [default: latin_3] --min-word-len=X discard words shorter than X [default: 5] --max-word-len=Y discard words longer than Y [default: 10] --specials=SPECIALS special characters to put after each word [default: 0123456789!@#$%^&*?+=] --sep=SEP word separator [default: -] --filter=REGEX regex used to filter words in corpus files [default: \\b(\w+)\\b] Note: Python's `os.urandom` is used as a secure randomness source. If it is unavailable, a warning will be emitted and it will fall back to python's default random generator (Mersenne Twister). In the later case you SHOULD NOT use the generated password for security purpose. In any case, USE AT YOUR OWN RISK. See https://docs.python.org/3.5/library/os.html#os.urandom for more info. """ from bisect import bisect from collections import namedtuple from itertools import accumulate from math import log, ceil from os import path, makedirs import pickle import random import re import textwrap from docopt import docopt import logbook #import pyperclip _sysrand = random.SystemRandom() try: _sysrand.randrange(10) except NotImplementedError: logbook.warning('could not find a reliable randomness source: DO NOT USE ' 'IT FOR SECURITY PURPOSE') randrange = random.randrange else: randrange = _sysrand.randrange def word_list(files, filter=r'\b([a-zA-Z][a-zA-Z]+)\b'): pat = re.compile(filter) words = [] for f in files: with open(f) as s: words.extend(pat.findall(s.read())) return [w.lower() for w in words] def count_transitions(words, n): trans = {} for w in words: xs = '\x00'*n + w + '\x01' for i in range(len(w)+1): curr = xs[i:i+n] succ = xs[i+n] if curr not in trans: trans[curr] = {succ: 1} elif succ not in trans[curr]: trans[curr][succ] = 1 else: trans[curr][succ] += 1 return trans Node = namedtuple('Node', ('choices', 'cumdist', 'entropy')) def build_model(trans): model = {} for (state, succs) in trans.items(): tot = sum(succs.values()) ord = tuple(succs.items()) # get some fixed order model[state] = Node( choices=tuple(x[0] for x in ord), cumdist=tuple(accumulate([x[1] for x in ord])), entropy=-sum(f/tot * log(f/tot, 2) for f in succs.values())) return model class Generator: def __init__(self, words=None, state_size=None, path=None): if path is not None: with open(path, 'rb') as s: self.state_size, self.model = pickle.load(s) else: assert words is not None and state_size is not None, 'bad arguments' self.state_size = state_size trans = count_transitions(words, state_size) self.model = build_model(trans) def dump_model(self, path): with open(path, 'wb') as s: pickle.dump((self.state_size, self.model), s) def generate(self): state = '\x00' * self.state_size out = [] entropy = 0 while True: node = self.model[state] entropy += node.entropy r = randrange(node.cumdist[-1]) succ = node.choices[bisect(node.cumdist, r)] if succ == '\x01': return ''.join(out), entropy out.append(succ) state = state[1:] + succ def generate_password(self, min_entropy, min_word_len=5, max_word_len=10, specials='0123456789!@#$%^&*?+=~', sep='-'): words = [] entropy = 0 n = len(specials) sp_ent = log(n, 2) if n > 0 else 0 while entropy < min_entropy: while True: w, e = self.generate() if min_word_len <= len(w) < max_word_len: break entropy += e if n > 0: s = specials[randrange(n)] entropy += sp_ent words.append(w + s) else: words.append(w) return (sep.join(words), entropy) def main(): logbook.StderrHandler(format_string='{record.level_name}: {record.message}').push_application() args = docopt(__doc__) BASE = path.expanduser(path.join('~', '.local', 'share', 'andrei')) makedirs(BASE, exist_ok=True) if args['modelize']: words = word_list(args['FILE'], args['--filter']) logbook.info(textwrap.shorten('Found {} words: {}'.format( len(words), ', '.join(words[:50])), width=70, placeholder='...')) gen = Generator(words, int(args['STATE_SIZE'])) gen.dump_model(path.join(BASE, args['NAME'])) print('Successfully generated model {}'.format(args['NAME'])) else: gen = Generator(path=path.join(BASE, args['--model'])) pw, ent = gen.generate_password( int(args['--entropy']), int(args['--min-word-len']), int(args['--max-word-len']), args['--specials'], args['--sep']) logbook.info('entropy: {:.3f}'.format(ent)) if args['--clip']: pyperclip.copy(pw) else: print(pw) if __name__ == '__main__': main()