diff --git a/BirthdayProblem.py b/BirthdayProblem.py index 019c2c4..f187fa6 100644 --- a/BirthdayProblem.py +++ b/BirthdayProblem.py @@ -93,7 +93,7 @@ def isLessThan(a, b): @staticmethod def isGreaterThan(a, b): return _DecimalContext.ctx.compare(a, b) == _DecimalFns.ONE - + @staticmethod def areEqual(a, b): return _DecimalContext.ctx.compare(a, b) == _DecimalFns.ZERO @@ -175,7 +175,7 @@ def facultyNTakeMLogE(n, nLogE, m): @staticmethod def facultyLog(n, nLog, isLog2): if _DecimalFns.isZero(n): # n == 0 - return _DecimalFns.ONE + return _DecimalFns.ZERO else: if isLog2: return _DecimalFns.__facultyStirlingLog2(n, nLog) @@ -193,7 +193,7 @@ def facultyLog(n, nLog, isLog2): = ln((n/e)^n) + ln(sqrt(2 * pi * n)) = n(ln(n/e)) + ln((2 * pi * n)^(1/2)) = n(ln(n) - ln(e)) + 0.5(ln(2) + ln(pi) + ln(n)) - = n(nLogE - 1) + 0.5(LOG_E_2 + LOG_E_PI + nLogE + = n(nLogE - 1) + 0.5(LOG_E_2 + LOG_E_PI + nLogE) ''' # in e-log space @staticmethod @@ -283,6 +283,8 @@ class _BirthdayProblemSolverChecked: @staticmethod def birthdayProblem(maybeD, dLog, maybeN, nLog, calcPrecision, dIsLog2): + # NOTE that dLog and nLog are 2 OR e base logarithms depending on input processing reflected in dIsLog2 + if (dIsLog2 and _DecimalFns.isLessThanOne(nLog)) or (not dIsLog2 and _DecimalFns.isLessThan(nLog, _DecimalFns.LOG_E_2)): # trivially, if you sample less than 2 times, the chance of a non-unique sample is 0% return (_DecimalFns.ZERO, _BirthdayProblemSolver.CalcPrecision.TRIVIAL) @@ -326,7 +328,7 @@ def birthdayProblem(maybeD, dLog, maybeN, nLog, calcPrecision, dIsLog2): @staticmethod - def birthdayProblemInv(maybeD, dLog, p, dIsLog2): + def birthdayProblemInv(maybeD, dLog, p, calcPrecision, dIsLog2): if _DecimalFns.isZero(p): # trivially, to have a 0% chance of picking a duplicate, just pick one sample (or 0) return (_DecimalFns.ZERO if dIsLog2 else _DecimalFns.ONE, _BirthdayProblemSolver.CalcPrecision.TRIVIAL) @@ -338,19 +340,39 @@ def birthdayProblemInv(maybeD, dLog, p, dIsLog2): else: # if d is 
too large to calculate adding 1 to it is negligible return (dLog if maybeD is None else _DecimalContext.ctx.add(maybeD, _DecimalFns.ONE), _BirthdayProblemSolver.CalcPrecision.TRIVIAL) + elif _DecimalFns.isZero(dLog): + # set size is 1 and p is neither 0 nor 1, for ANY p in between 0 and 1, 2 samples are required to get a non-unique sample + return (_DecimalFns.ONE if dIsLog2 else _DecimalFns.TWO, _BirthdayProblemSolver.CalcPrecision.TRIVIAL) else: + if calcPrecision in [_BirthdayProblemSolver.CalcPrecision.EXACT] and (maybeD is None): + # d is needed for this method + raise SolverException(SolverErrorCode.D_NEEDED_FOR_METHOD, calcPrecision) + # carry out the calculations _DecimalContext.adjustPrecision(maybeD.adjusted()) if maybeD is not None else _DecimalContext.adjustPrecision(dLog.adjusted()) - if _DecimalContext.isTooPrecise(): - _DecimalContext.adjustPrecision(dLog.adjusted()) + if calcPrecision == _BirthdayProblemSolver.CalcPrecision.EXACT: if _DecimalContext.isTooPrecise(): # with a too high precision, even the simplest calculation takes too long - raise SolverException(SolverErrorCode.TOO_HIGH_PRECISION, _BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX) - if dIsLog2: - return (_BirthdayProblemSolverChecked.__birthdayProblemInvTaylorApproxLog2(dLog, p), _BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX) + raise SolverException(SolverErrorCode.TOO_HIGH_PRECISION, calcPrecision) + if dIsLog2: + return (_DecimalContext.ctx.divide( + _DecimalContext.ctx.ln( + _BirthdayProblemSolverChecked.__birthdayProblemInvExact(maybeD, p) + ), + _DecimalFns.LOG_E_2 + ), _BirthdayProblemSolver.CalcPrecision.EXACT) + else: + return (_BirthdayProblemSolverChecked.__birthdayProblemInvExact(maybeD, p), _BirthdayProblemSolver.CalcPrecision.EXACT) else: - return (_DecimalContext.ctx.exp(_BirthdayProblemSolverChecked.__birthdayProblemInvTaylorApproxLogE(dLog, p)), _BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX) - + if _DecimalContext.isTooPrecise(): + 
_DecimalContext.adjustPrecision(dLog.adjusted()) + if _DecimalContext.isTooPrecise(): + # with a too high precision, even the simplest calculation takes too long + raise SolverException(SolverErrorCode.TOO_HIGH_PRECISION, _BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX) + if dIsLog2: + return (_BirthdayProblemSolverChecked.__birthdayProblemInvTaylorApproxLog2(dLog, p), _BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX) + else: + return (_DecimalContext.ctx.exp(_BirthdayProblemSolverChecked.__birthdayProblemInvTaylorApproxLogE(dLog, p)), _BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX) ######################################################################################################################################################################################################## ######################################################################################################################################################################################################## @@ -358,10 +380,10 @@ def birthdayProblemInv(maybeD, dLog, p, dIsLog2): # Internal drivers # # # ######################################################################################################################################################################################################## - ######################################################################################################################################################################################################## + ######################################################################################################################################################################################################## ''' - A frequent formula in the context of the birthday problem (or paradox) calculates that chance of no two items being equal (all items unique) when drawing d (picked) items from a population of n + A frequent formula in the context of the birthday problem (or paradox) calculates the chance 
of no two items being equal (all items unique) when drawing d (picked) items from a population of n (possibilities) items. Since we can choose unique items from n in (n)_d ways, and you can pick d items (any) from n in n^d, the formula for this is: ^P(n, d) = (n)_d / n^d @@ -369,8 +391,8 @@ def birthdayProblemInv(maybeD, dLog, p, dIsLog2): In log space, this is: lg(^P(n, d))= lg((n)_d / n^d) - = ln((n)_d) - lg(n^d) - = ln((n)_d) - d * lg(n) + = lg((n)_d) - lg(n^d) + = lg((n)_d) - d * lg(n) This result calculates the chance of all items unique, but most often, we are interested in the chance of there being at least one (trivially two) non-unique item(s) among dm P(n, d), which is why we take the complement of ^P(n, d) as the final result of these functions. @@ -415,6 +437,9 @@ def __birthdayProblemStirlingApproxLogE(d, dLogE, n): The formula is based on the observation that ln(n!) = ln(n) + ln(n - 1) + ... + ln(1): P(n, d) ~ 1 - e^(-(n^2/2d)) + + NOTE: this is really an approximation of the approximation; replacing n^2 with n * (n - 1) gives + a more accurate formula. This implies that @@ -442,6 +467,11 @@ def __birthdayProblemStirlingApproxLogE(d, dLogE, n): lg(-ln(^P(n, d))) ~ 2 * lg(n) - (lg(2) + lg(d)) = 2 * nLog2 - (1 + dLog2) + + Connecting back to the previous NOTE, 2 * lg(n) comes from lg(n^2); with the more accurate formula we would + instead have lg(n * (n - 1)) which equals lg(n) + lg(n - 1). This requires us to have n available. Note that + one might be aware of this improvement but still omit it in order to stay more true to + the approximation in use here. Therefore it has been omitted below. 
''' # calculates result in base-2 logarithms (second level of logs) @@ -461,6 +491,11 @@ def __birthdayProblemTaylorApproxLog2(dLog2, nLog2): ln(-ln(^P(n, d))) ~ 2 * ln(n) - (ln(2) + ln(d)) = 2 * nLogE - (LOG_E_2 + dLogE) + + Again connecting back to the previous NOTE, 2 * ln(n) comes from ln(n^2); with the more accurate formula we + would instead have ln(n * (n - 1)) which equals ln(n) + ln(n - 1). This requires us to have n available. Note + that one might be aware of this improvement but still omit it in order to stay more + true to the approximation in use here. Therefore it has been omitted below. ''' # calculates result in natural logarithmic space @staticmethod @@ -474,6 +509,56 @@ def __birthdayProblemTaylorApproxLogE(dLogE, nLogE): prob = _DecimalContext.ctx.subtract(_DecimalFns.ONE, negProb) # complement return prob + ''' + The formula for the forward birthday problem is + + P(n, d) = 1 - (1 - 1/d) * (1 - 2/d) * ... * (1 - (n - 1)/d) + P(n, d) = 1 - ((d - 1)/d) * ((d - 2)/d) * ... * ((d - (n - 1))/d) + + Remember, this P(n, d) is the probability of a collision. + + This leads to + ^P(n, d) = ((d - 1)/d) * ((d - 2)/d) * ... * ((d - (n - 1))/d) + + which is the probability of all unique, which in log space is + + ln(^P(n, d)) = ln(((d - 1)/d) * ((d - 2)/d) * ... * ((d - (n - 1))/d)) + = ln((d - 1)/d) + ln((d - 2)/d) + ... + ln((d - (n - 1))/d) + = ln(d - 1) - ln(d) + ln(d - 2) - ln(d) + ... + ln(d - (n - 1)) - ln(d) + + We want to find n such that P(n, d) >= Pin (input p), i.e. the number of samples needed for the probability of a + collision to be equal to or higher than Pin. This means that we need to find n such that ^P(n, d) <= 1 - Pin. + The first such value for n entails a P(n, d) >= Pin. In log space y = ln(x), probabilities from 0 to 1 map to + y values from negative infinity to 0. We start with a single sample for which the probability of all samples + unique is 1. 
For every added sample, this probability decreases towards 1 - Pin until it is less than or equal to 1 - Pin + at which point we have found the relevant n. + + If we want to use a naive and exact method, we can calculate n numerically by adding one term to the sum until + ln(^P(n, d)) <= ln(1 - P) + + Note that for some reason, these calculations are faster in non-logarithmic space, perhaps due to the ln + operation taking more time than is saved by using add / subtract in logarithmic space rather than multiply + / divide in regular space. + + Also note that the first term in the above calculation is really 1 with a probability of all unique being 1. + Already when we calculate 1 * ((d - 1)/d) we are assuming 2 samples. Thus the value of n is really reflecting + how many ADDITIONAL samples, beyond the initial one, are needed for the calculated probability. We thus make + up for this by adding 1 to the final result. + ''' + @staticmethod + def __birthdayProblemInvExact(d, p): + pInv = _DecimalContext.ctx.subtract(_DecimalFns.ONE, p) + n = _DecimalFns.ONE + currentPInv = _DecimalContext.ctx.divide(_DecimalContext.ctx.subtract(d, n), d) + while currentPInv > pInv: + n = _DecimalContext.ctx.add(n, _DecimalFns.ONE) + currentPInv = _DecimalContext.ctx.multiply( + currentPInv, + _DecimalContext.ctx.divide(_DecimalContext.ctx.subtract(d, n), d) + ) + final = _DecimalContext.ctx.add(n, _DecimalFns.ONE) + return final + ''' The formula for calculating the inverted birthday problem, namely how many times to sample from a set to reach a probability p of some non-unique samples, also uses the above Taylor approximation. @@ -511,6 +596,12 @@ def __birthdayProblemTaylorApproxLogE(dLogE, nLogE): lg(n(P, d)) ~ 0.5 * ( ln(-ln(1 - P)) + ln(2) + ln(d)) = 0.5 * ( ln(-ln(1 - p)) + LOG_E_2 + dLogE ) + + NOTE that this calculation uses the approximation of the approximation using n^2 instead of the more accurate + n * (n - 1). 
Of course this is necessary here since we need to be able to do the root calculation to arrive at + an answer. The forward formula however may take (it currently doesn't) the less coarse approximation into + account whereby the forward and backward calculations may differ when verifying n(P, d) in P(n, d) with the + Taylor approximation. ''' # with base e logarithms @staticmethod @@ -684,12 +775,12 @@ def resultTextBirthdayProblemInvNumbers(n, isLog2, prec = None): nLog2Text = "" nLog10Text = _BirthdayProblemTextFormatter.parenthesize(_BirthdayProblemNumberFormatter.toLog10ReprOrNone(n)) (prefix, nText) = _BirthdayProblemNumberFormatter.toIntegralRounded(n, ROUND_CEILING) - return prefix + nLog2Text + nText + nLog10Text + return prefix + nLog2Text + nText, nLog10Text @staticmethod def resultTextBirthdayProblemInv(n, isLog2, method, prec = None): - nText = _BirthdayProblemTextFormatter.resultTextBirthdayProblemInvNumbers(n, isLog2, prec) - return nText + _BirthdayProblemTextFormatter.parenthesize(_BirthdayProblemTextFormatter.methodToDescription(method, True)) + nText, nLog10Text = _BirthdayProblemTextFormatter.resultTextBirthdayProblemInvNumbers(n, isLog2, prec) + return (nText, nLog10Text, _BirthdayProblemTextFormatter.parenthesize(_BirthdayProblemTextFormatter.methodToDescription(method, True))) @staticmethod def headerTextBirthdayProblemNumbers(dLogOrNot, nLogOrNot, isLog2, prec = None): @@ -769,13 +860,13 @@ def sanitize(dOrDLog, nOrNLog, p, isBinary, isCombinations, isStirling, isTaylor if (p is None and nOrNLog is None) or (p is not None and nOrNLog is not None): raise SolverException(SolverErrorCode.BAD_INPUT, message = _BirthdayProblemInputHandler.illegalInputString() + ": please provide a non-None value for either '" + varMap.get("nOrDLog", "nOrDLog") + "' or '" + varMap.get("p", "p") + "' (not both)") - + if nOrNLog is not None: _BirthdayProblemInputHandler.checkDecimal(nOrNLog, varMap.get("nOrDLog", "nOrDLog")) if not isStirling and not isExact and not 
isTaylor and not isAll: - raise SolverException(SolverErrorCode.BAD_INPUT, message = _BirthdayProblemInputHandler.illegalInputString() + ": must set at least one of '" + varMap.get("isStirling", "isStirling") + "', '" + varMap.get("isTaylor", "isTaylor") + "', '" + varMap.get("isExact", "isExact") + "' or '" + varMap.get("isAll", "isAll") + "' when '" + varMap.get("nOrNLog", "nOrNLog") + "' is not None.") + raise SolverException(SolverErrorCode.BAD_INPUT, message = _BirthdayProblemInputHandler.illegalInputString() + ": must set at least one of '" + varMap.get("isStirling", "isStirling") + "', '" + varMap.get("isTaylor", "isTaylor") + "', '" + varMap.get("isExact", "isExact") + "' or '" + varMap.get("isAll", "isAll") + "'.") elif (isStirling or isExact or isTaylor) and isAll: - raise SolverException(SolverErrorCode.BAD_INPUT, message = _BirthdayProblemInputHandler.illegalInputString() + ": flag '" + varMap.get("isAll", "isAll") + "' was true and implicitly includes '" + varMap.get("isStirling", "isStirling") + "', '" + varMap.get("isTaylor", "isTaylor") + "' and '" + varMap.get("isExact", "isExact") + "' set to True which should then not be set to True.") + raise SolverException(SolverErrorCode.BAD_INPUT, message = _BirthdayProblemInputHandler.illegalInputString() + ": flag '" + varMap.get("isAll", "isAll") + "' was true and implicitly includes '" + varMap.get("isStirling", "isStirling") + "' (with -n), '" + varMap.get("isTaylor", "isTaylor") + "' and '" + varMap.get("isExact", "isExact") + "' set to True which should then not be set to True.") elif not _DecimalFns.isInteger(nOrNLog): raise SolverException(SolverErrorCode.BAD_INPUT, message = _BirthdayProblemInputHandler.illegalInputString(varMap.get("nOrDLog", "nOrDLog")) + ": please provide an integer") elif _DecimalFns.isLessThanZero(nOrNLog): @@ -783,8 +874,8 @@ def sanitize(dOrDLog, nOrNLog, p, isBinary, isCombinations, isStirling, isTaylor else: _BirthdayProblemInputHandler.checkDecimal(p, varMap.get("p", 
"p")) - if isStirling or isExact or isTaylor: - raise SolverException(SolverErrorCode.BAD_INPUT, message = _BirthdayProblemInputHandler.illegalInputString() + ": '" + varMap.get("isStirling", "isStirling") + "', '" + varMap.get("isTaylor", "isTaylor") + "' and '" + varMap.get("isExact", "isExact") + "' or '" + varMap.get("isAll", "isAll") +"' should only be non-False when '" + varMap.get("nOrDLog", "nOrDLog") + "' is not None (with '" + varMap.get("p", "p") + "' != None), Taylor approximation is always used).") + if isStirling: + raise SolverException(SolverErrorCode.BAD_INPUT, message = _BirthdayProblemInputHandler.illegalInputString() + ": '" + varMap.get("isStirling", "isStirling") + "' should only be non-False when '" + varMap.get("nOrDLog", "nOrDLog") + "' is not None.") elif _DecimalFns.isGreaterThanOne(p) or _DecimalFns.isLessThanZero(p): raise SolverException(SolverErrorCode.BAD_INPUT, message = _BirthdayProblemInputHandler.illegalInputString(varMap.get("p", "p")) + ": please provide a non-negative decimal number in the range [0.0, 1.0]") @@ -800,7 +891,7 @@ def setup(dOrDLog, nOrNLog, p, isBinary, isCombinations): try: if isCombinations: # d is the size of a set of items, calculate the number of permutations that is possible with it - if isBinary: + if isBinary: dLog = _DecimalFns.facultyLog(_DecimalContext.ctx.power(_DecimalFns.TWO, dOrDLog), dOrDLog, True) d = _DecimalContext.ctx.power(_DecimalFns.TWO, dLog) else: @@ -846,28 +937,28 @@ def parse(args = None): description="Treats the generalized birthday problem for arbitrary values.\n\nCalculates the generalized birthday problem, the probability P that, when sampling uniformly at random N times (with replacement)" + " from a set of D unique items, there is a non-unique item among the N samples. 
In the original birthday problem formulation, N is 23 and D is 366 (or 365) for a risk of P ≈ 0.5 = 50% of at least two people having the same" + " birthday.\n\nSupports calculating both the probability P from N and D (using exact method, exact method with Stirling's approximation in the calculation of faculties and Taylor approximation) and N" - + " from D and P (Taylor approximation only). Both approximations get asymptotically close to the exact result as D grows towards infinity. The exact method should not be used for larger numbers. For extremely small probabilities P, the exact method with Stirling's" - + " approximation used for faculties may become unstable as it involves many more different operations than the Taylor approximation which, each, results in small round-offs. Another source of error in this case arises" - + " from the use of Stirling's formula for two calculations of faculties (D! and (D - N)!). Since one of these ((D - N)!) diverges slightly more from the exact result than the other (D!), the difference between" - + " these (used for calculations in log space) might introduce small errors when P is extremely small. A good check to see whether the approximation in question is suffering or not is to compare it to the Taylor" + + " from D and P (using exact method and Taylor approximation). Both approximations get asymptotically close to the exact result as D grows towards infinity. The exact method should not be used for larger numbers. For extremely" + + " small probabilities P, the exact method with Stirling's approximation used for faculties may become unstable as it involves many more different operations than the Taylor approximation which, each, results in small round-offs." + + " Another source of error in this case arises from the use of Stirling's formula for two calculations of faculties (D! and (D - N)!). Since one of these ((D - N)!) 
diverges slightly more from the exact result than the other (D!)," + + " the difference between these (used for calculations in log space) might introduce small errors when P is extremely small. A good check to see whether the approximation in question is suffering or not is to compare it to the Taylor" + " approximation and see whether they match well.\n\nInputs D and N can be seen as literal input numbers or as exponents of base 2 (with -b flag). Furthermore, input D can be seen as a set of items from which" + " we should produce the D! permutations before proceeding with further calculations (with flag -c).\n\nExample usage:\n\n Example 1:\n Calculate the probability P of at least one non-unique birthday among N = 23 persons with all available methods:\n" - + " > python BirthdayProblem.py 366 -n 23 -a\n\n Example 2:\n Calculate, approximatively, the number of times N a deck of cards has to be shuffled to have a P = 50% probability of seeing a repeated shuffle:\n > python BirthdayProblem.py 52 -p 0.5 -c\n\n Example 3:\n" + + " > python BirthdayProblem.py 366 -n 23 -a\n\n Example 2:\n Calculate, with Taylor approximation, the number of times N a deck of cards has to be shuffled to have a P = 50% probability of seeing a repeated shuffle:\n > python BirthdayProblem.py 52 -p 0.5 -t -c\n\n Example 3:\n" + " Calculate, with approximative methods, the probability P of a collision in a 128-bit crypto when encrypting N = 2^64 = 18 446 744 073 709 551 616 blocks with the same key and output answer as a Json object with at most 5 decimals:\n > python BirthdayProblem.py 128 -n 64 -b -s -t -j --prec 5", formatter_class=argparse.RawTextHelpFormatter ) parser.add_argument('d', metavar=('D'), type=str, nargs=1, help='Input number D, the total number of unique items, or a number from which the total number of unique items can be derived, in the set we are sampling from.') parser.add_argument('-n', '--samples', metavar=('N'), type=str, help='Input number N, the number of samples, 
or a number from which the number of samples can be derived from, taken from the full set of D items. When present the probability P of at least one non-unique item among the samples will be calculated. Requires one of flags -e, -s, -t or -a to determine the desired precision(s) of the calculation.') - parser.add_argument('-p', '--probability', metavar=('P'), type=str, help='Input number P in [0.0, 1.0], the the probability of at least one non-unique item among the samples. When present the needed number of samples N will be approximated with Taylor series.') + parser.add_argument('-p', '--probability', metavar=('P'), type=str, help='Input number P in [0.0, 1.0], the the probability of at least one non-unique item among the samples. When present the needed number of samples N will be calculated. Requires one of flags -e, -t or -a to determine the desired precision(s) of the calculation.') parser.add_argument('-b', '--binary', dest='binary', action='store_const', const=True, default=False, help='Inputs D and N are seen as exponents with base 2') parser.add_argument('-c', '--combinations', dest='combinations', action='store_const', const=True, default=False, help="Input D is seen as a number of unique items in a set from which we can yield N! (factorial) different members for the resulting set of unique items from which we sample. The calculation of D! 
uses Stirling's approximation which might introduce a small error responsible for the difference in results with the same input with and without -c flag.") - parser.add_argument('-t', '--taylor', dest='taylor', action='store_const', const=True, default=False, help='Use Taylor approximation to calculate the birthday problem (only with flag -n) (best suited for extremely large numbers)') + parser.add_argument('-t', '--taylor', dest='taylor', action='store_const', const=True, default=False, help='Use Taylor approximation to calculate the birthday problem (best suited for extremely large numbers)') parser.add_argument('-s', '--stirling', dest='stirling', action='store_const', const=True, default=False, help='Use exact method but approximate faculty calculations with Stirling\'s formula (only with flag -n) (best suited up to extremely large numbers)') - parser.add_argument('-e', '--exact', dest='exact', action='store_const', const=True, default=False, help='Use exact method (only with flag -n) (WARNING! This method becomes too slow very quickly as calculations grow with complexity O(N!) where N is the size of the sampled set) (best suited for smaller numbers)') - parser.add_argument('-a', '--all', dest='all', action='store_const', const=True, default=False, help='Use all methods for the calculation (same as using flags -e, -s, -t when used with -n, otherwise it has no effect)') + parser.add_argument('-e', '--exact', dest='exact', action='store_const', const=True, default=False, help='Use exact method (WARNING! This method becomes too slow very quickly as calculations grow with complexity O(N!) 
where N is the size of the sampled set) (best suited for smaller numbers)') + parser.add_argument('-a', '--all', dest='all', action='store_const', const=True, default=False, help='Use all methods for the calculation (same as using flags -e, -s, -t when used with -n, otherwise same as using flags -e, -t)') parser.add_argument('-j', '--json', dest='json', action='store_const', const=True, default=False, help='Output results as a Json object') @@ -879,12 +970,12 @@ def parse(args = None): parser.error("Please provide one of flags -n or -p with corresponding argument.") elif args.probability is not None and args.samples is not None: parser.error("Please provide EITHER a flag -n or -p, not both.") - elif args.samples is not None and not args.stirling and not args.exact and not args.taylor and not args.all: - parser.error("Must set at least one of flags -s, -t, -e or -a together with -n.") - elif (args.stirling or args.exact or args.taylor) and args.samples is None: - parser.error("Flags -s, -t and -e should only be used with flag -n (with flag -p, Taylor approximation is always used).") + elif not args.stirling and not args.exact and not args.taylor and not args.all: + parser.error("Must set at least one of flags -s, -t, -e or -a.") + elif (args.stirling) and args.samples is None: + parser.error("Flag -s should only be used with flag -n.") elif (args.stirling or args.exact or args.taylor) and args.all: - parser.error("Flag -a was set and implicitly includes -s, -t and -e which should then not be used.") + parser.error("Flag -a was set and implicitly includes -s (with -n), -t and -e which should then not be used.") elif re.fullmatch(r'[\d]+', args.d[0]) is None: parser.error("Illegal input for D: please provide a non-negative integer with digits only") elif args.samples and re.fullmatch(r'[\d]+', args.samples) is None: @@ -950,26 +1041,30 @@ def solveText(d, dLog, n, nLog, p, pPercent, isBinary, isStirling, isTaylor, isE res = [] outputter = (lambda s: print(s)) if 
isMainProgram else (lambda s: res.append(s)) + lastMethodUsed = None + results = [] + # do the calculations based on mode if p is not None: outputter(_BirthdayProblemTextFormatter.headerTextBirthdayProblemInv(dLog if isBinary else d, p, pPercent, isBinary, prec)) - try: - (n, methodUsed) = _BirthdayProblemSolverChecked.birthdayProblemInv(d, dLog, p, isBinary) - except BaseException as e: - methodText = _BirthdayProblemTextFormatter.parenthesize(_BirthdayProblemTextFormatter.methodToShortDescription(_BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX)) - errorText = "N/A (Calculation failed: " + str(e).lower() + methodText + ")" - if isinstance(e, KeyboardInterrupt): - outputter(_BirthdayProblemTextFormatter.indented("N/A (Interrupted by user" + methodText + ")")) - elif isinstance(e, SolverException): - outputter(_BirthdayProblemTextFormatter.indented(errorText)) - else: - outputter(_BirthdayProblemTextFormatter.indented(errorText)) - else: - outputter(_BirthdayProblemTextFormatter.indented(_BirthdayProblemTextFormatter.resultTextBirthdayProblemInv(n, isBinary, methodUsed, prec))) + for (method, included) in [(_BirthdayProblemSolver.CalcPrecision.EXACT, isExact), (_BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX, isTaylor)]: + if (included or isAll) and lastMethodUsed != _BirthdayProblemSolver.CalcPrecision.TRIVIAL: + try: + (n, methodUsed) = _BirthdayProblemSolverChecked.birthdayProblemInv(d, dLog, p, method, isBinary) + lastMethodUsed = methodUsed + except BaseException as e: + methodText = _BirthdayProblemTextFormatter.parenthesize(_BirthdayProblemTextFormatter.methodToShortDescription(method)) + errorText = " (Calculation failed: " + str(e).lower() + methodText + ")" + if isinstance(e, KeyboardInterrupt): + results += [("N/A", "", " (Interrupted by user" + methodText + ")")] + elif isinstance(e, SolverException): + results += [("N/A", "", errorText)] + else: + results += [("N/A", "", errorText)] + else: + results += 
[_BirthdayProblemTextFormatter.resultTextBirthdayProblemInv(n, isBinary, methodUsed, prec)] else: outputter(_BirthdayProblemTextFormatter.headerTextBirthdayProblem(dLog if isBinary else d, nLog if isBinary else n, isBinary, prec)) - lastMethodUsed = None - results = [] for (method, included) in [(_BirthdayProblemSolver.CalcPrecision.EXACT, isExact), (_BirthdayProblemSolver.CalcPrecision.STIRLING_APPROX, isStirling), (_BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX, isTaylor)]: if (included or isAll) and lastMethodUsed != _BirthdayProblemSolver.CalcPrecision.TRIVIAL: try: @@ -987,11 +1082,11 @@ def solveText(d, dLog, n, nLog, p, pPercent, isBinary, isStirling, isTaylor, isE results += [("N/A", "", errorText)] else: results += [_BirthdayProblemTextFormatter.resultTextBirthdayProblem(p, pPercent, methodUsed, prec)] - # map every value for results and log10 results to the length of the string (=> an array of tuples), then spred it with * so that we add tuples as vararg input to zip which will then create two - # lists, one with all first value, and one with all last values. For each of these arrays, we take the maximum and then we have the length of the longest res text and the length of the longest log 10 res text - (maxLenRes, maxLenLog10Repr) = map(lambda l: max(l), zip(*map(lambda tup: (len(tup[0]), len(tup[1])), results))) - for (resText, log10Repr, methodText) in results: - outputter(_BirthdayProblemTextFormatter.indented(resText.ljust(maxLenRes, " ") + log10Repr.ljust(maxLenLog10Repr, " ") + methodText)) + # map every value for results and log10 results to the length of the string (=> an array of tuples), then spread it with * so that we add tuples as vararg input to zip which will then create two + # lists, one with all first value, and one with all last values. 
For each of these arrays, we take the maximum and then we have the length of the longest res text and the length of the longest log 10 res text + (maxLenRes, maxLenLog10Repr) = map(lambda l: max(l), zip(*map(lambda tup: (len(tup[0]), len(tup[1])), results))) + for (resText, log10Repr, methodText) in results: + outputter(_BirthdayProblemTextFormatter.indented(resText.ljust(maxLenRes, " ") + log10Repr.ljust(maxLenLog10Repr, " ") + methodText)) if(not isMainProgram): return "\n".join(res) @@ -1004,21 +1099,25 @@ def solveJson(d, dLog, n, nLog, p, pPercent, isBinary, isStirling, isTaylor, isE dText, pText = _BirthdayProblemTextFormatter.headerTextBirthdayProblemInvNumbers(dLog if isBinary else d, p, pPercent, isBinary, prec) res['d'] = dText res['p'] = pText - try: - (n, methodUsed) = _BirthdayProblemSolverChecked.birthdayProblemInv(d, dLog, p, isBinary) - except BaseException as e: - methodKey = _BirthdayProblemTextFormatter.methodToText(_BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX).lower() - errorMessage = str(e).lower() - if isinstance(e, KeyboardInterrupt): - res['results'][methodKey] = { 'error': 'interrupted' } - elif isinstance(e, SolverException): - res['results'][methodKey] = { 'error': errorMessage } - else: - res['results'][methodKey] = { 'error': errorMessage } - else: - methodKey = _BirthdayProblemTextFormatter.methodToText(methodUsed).lower() - n = _BirthdayProblemTextFormatter.resultTextBirthdayProblemInvNumbers(n, isBinary, prec) - res['results'][methodKey] = { 'result' : n } + lastMethodUsed = None + for (method, included) in [(_BirthdayProblemSolver.CalcPrecision.EXACT, isExact), (_BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX, isTaylor)]: + if (included or isAll) and lastMethodUsed != _BirthdayProblemSolver.CalcPrecision.TRIVIAL: + try: + (n, methodUsed) = _BirthdayProblemSolverChecked.birthdayProblemInv(d, dLog, p, method, isBinary) + lastMethodUsed = methodUsed + except BaseException as e: + methodKey = 
_BirthdayProblemTextFormatter.methodToText(_BirthdayProblemSolver.CalcPrecision.TAYLOR_APPROX).lower() + errorMessage = str(e).lower() + if isinstance(e, KeyboardInterrupt): + res['results'][methodKey] = { 'error': 'interrupted' } + elif isinstance(e, SolverException): + res['results'][methodKey] = { 'error': errorMessage } + else: + res['results'][methodKey] = { 'error': errorMessage } + else: + methodKey = _BirthdayProblemTextFormatter.methodToText(methodUsed).lower() + n = "".join(_BirthdayProblemTextFormatter.resultTextBirthdayProblemInvNumbers(n, isBinary, prec)) + res['results'][methodKey] = { 'result' : n } else: dText, nText = _BirthdayProblemTextFormatter.headerTextBirthdayProblemNumbers(dLog if isBinary else d, nLog if isBinary else n, isBinary, prec) res['d'] = dText diff --git a/DEVELOPERS_NOTES.md b/DEVELOPERS_NOTES.md new file mode 100644 index 0000000..4d9b838 --- /dev/null +++ b/DEVELOPERS_NOTES.md @@ -0,0 +1,8 @@ + +* `Decimal`s are immutable, but in some places an input is wrapped in `Decimal(x)`. This is likely because this input + can sometimes be a regular number, OR has been so historically and the creation of a `Decimal` has been left in place. +* Adjusting the precision of `Decimal`s via `adjustPrecision` is an attempt to allow a certain number of digits to the right of + the decimal point so that, depending on the integer part, a `Decimal` can have its precision increased or decreased at + different times after some processing has been done. If this results in too large a number, then the precision needed is + too high and we can't carry out the calculations. This limit is set at 1000 digits (of which at most 100 are to the + right of the decimal point). Larger numbers than this will result in the calculations failing. 
diff --git a/DataTest.py b/DataTest.py index 8446677..480590e 100644 --- a/DataTest.py +++ b/DataTest.py @@ -10,13 +10,16 @@ testData = [ ['1 -p 1.0 -a', True, { 'd': '1', 'p': '100%', 'results': { 'trivial': {'result': '2'} } }], ['1 -p 0.0 -a', True, { 'd': '1', 'p': '0%', 'results': { 'trivial': {'result': '1'} } }], - ['1 -p 0.5 -a', True, { 'd': '1', 'p': '50%', 'results': { 'taylor': {'result': '2'} } }], - ['1000000000 -p 0.0000001', True, { 'd': '1000000000 (=10^9)', 'p': '0.00001% (=10^-7)', 'results': { 'taylor': {'result': '15'} } }], + ['1 -p 0.5 -a', True, { 'd': '1', 'p': '50%', 'results': { 'trivial': {'result': '2'} } }], + ['1000000000 -p 0.0000001 -t', True, { 'd': '1000000000 (=10^9)', 'p': '0.00001% (=10^-7)', 'results': { 'taylor': {'result': '15'} } }], ['1 -n 1 -a', True, { 'd': '1', 'n': '1', 'results': { 'trivial': {'result': '0%'} } }], ['1 -n 0 -a', True, { 'd': '1', 'n': '0', 'results': { 'trivial': {'result': '0%'} } }], ['1 -n 2 -a', True, { 'd': '1', 'n': '2', 'results': { 'trivial': {'result': '100%'} } }], + ['69 -p 0.5 -a', True, { 'd': '69', 'p': '50%', 'results': { 'exact': {'result': '11'}, 'taylor': {'result': '10'} } }], + ['83 -p 0.5 -a', True, { 'd': '83', 'p': '50%', 'results': { 'exact': {'result': '12'}, 'taylor': {'result': '11'} } }], + ['1000000000 -p 0.5 -a', True, { 'd': '1000000000 (=10^9)', 'p': '50%', 'results': { 'exact': {'result': '37234'}, 'taylor': {'result': '37233'} } }], ['366 -n 23 -a', True, { 'd': '366', 'n': '23', 'results': { 'exact': {'result': '≈50.6323011819%'}, 'taylor': {'result': '≈51.4549326419%'}, 'stirling': {'result': '≈50.6315474495%'} } }], - ['366 -p 0.5', True, { 'd': '366', 'p': '50%', 'results': { 'taylor': {'result': '23'} } }], + ['366 -p 0.5 -a', True, { 'd': '366', 'p': '50%', 'results': { 'taylor': {'result': '23'}, 'exact': {'result': '23'} } }], [ '6274264876827642864872634872364782634 -n 2376287346287353638 -s -t', True, @@ -32,7 +35,7 @@ ['128 -n 0 -b -s -t', True, { 'd': 
'2^128', 'n': '2^0', 'results': { 'trivial': {'result': '0%'} } }], ['128 -n 129 -b -s -t', True, { 'd': '2^128', 'n': '2^129', 'results': { 'trivial': {'result': '100%'} } }], ['128 -n 64 -b -s -t', True, { 'd': '2^128', 'n': '2^64', 'results': { 'stirling': {'result': '≈39.3469340287%'}, 'taylor': {'result': '≈39.3469340287%'} } }], - ['128 -p 0.5 -b', True, { 'd': '2^128', 'p': '50%', 'results': { 'taylor': {'result': '≈2^64.2356168135'} } }], + ['128 -p 0.5 -b -t', True, { 'd': '2^128', 'p': '50%', 'results': { 'taylor': {'result': '≈2^64.2356168135'} } }], [ '2000000 -n 1000000 -b -s -t', True, @@ -45,7 +48,7 @@ } } ], - ['2000000 -p 0.5 -b', True, { 'd': '2^2000000', 'p': '50%', 'results': { 'taylor': {'result': '≈2^1000000.2356168135'} } }], + ['2000000 -p 0.5 -b -t', True, { 'd': '2^2000000', 'p': '50%', 'results': { 'taylor': {'result': '≈2^1000000.2356168135'} } }], [ '8 -n 3 -b -a', True, @@ -73,7 +76,7 @@ } ], [ - '52 -p 0.1 -c', + '52 -p 0.1 -c -t', True, { 'd': '≈80529020383886612857810199580012764961409004334781435987268084328737 (≈8*10^67)', @@ -84,7 +87,7 @@ } ], [ - '52 -p 0.5 -c', + '52 -p 0.5 -c -t', True, { 'd': '≈80529020383886612857810199580012764961409004334781435987268084328737 (≈8*10^67)', diff --git a/OutputTest.py b/OutputTest.py index 67758e1..c23df9b 100644 --- a/OutputTest.py +++ b/OutputTest.py @@ -22,10 +22,10 @@ ['1 -p 0.5 -a', True, [ 'The number of samples, sampled uniformly at random from a set of 1 items, needed to have at least a 50% chance of a non-unique sample is:', - ' 2 (Taylor series approximation used in main calculation)' + ' 2 (Trivial solution)' ] ], - ['1000000000 -p 0.0000001', True, + ['1000000000 -p 0.0000001 -t', True, [ 'The number of samples, sampled uniformly at random from a set of 1000000000 (=10^9) items, needed to have at least a 0.00001% (=10^-7) chance of a non-unique sample is:', ' 15 (Taylor series approximation used in main calculation)' @@ -49,6 +49,27 @@ ' 100% (Trivial solution)' ] ], + ['69 -p 
0.5 -a', True, + [ + 'The number of samples, sampled uniformly at random from a set of 69 items, needed to have at least a 50% chance of a non-unique sample is:', + ' 11 (Exact method)', + ' 10 (Taylor series approximation used in main calculation)' + ] + ], + ['83 -p 0.5 -a', True, + [ + 'The number of samples, sampled uniformly at random from a set of 83 items, needed to have at least a 50% chance of a non-unique sample is:', + ' 12 (Exact method)', + ' 11 (Taylor series approximation used in main calculation)' + ] + ], + ['1000000000 -p 0.5 -a', True, + [ + 'The number of samples, sampled uniformly at random from a set of 1000000000 (=10^9) items, needed to have at least a 50% chance of a non-unique sample is:', + ' 37234 (Exact method)', + ' 37233 (Taylor series approximation used in main calculation)' + ] + ], ['366 -n 23 -a', True, [ 'The probability of finding at least one non-unique sample among 23 samples, sampled uniformly at random from a set of 366 items, is:', @@ -57,9 +78,10 @@ ' ≈51.4549326419% (Taylor series approximation used in main calculation (removes need for factorial calculation))' ] ], - ['366 -p 0.5', True, + ['366 -p 0.5 -a', True, [ 'The number of samples, sampled uniformly at random from a set of 366 items, needed to have at least a 50% chance of a non-unique sample is:', + ' 23 (Exact method)', ' 23 (Taylor series approximation used in main calculation)' ] ], @@ -89,7 +111,7 @@ ' ≈39.3469340287% (Taylor series approximation used in main calculation (removes need for factorial calculation))' ] ], - ['128 -p 0.5 -b', True, + ['128 -p 0.5 -b -t', True, [ 'The number of samples, sampled uniformly at random from a set of 2^128 items, needed to have at least a 50% chance of a non-unique sample is:', ' ≈2^64.2356168135 (Taylor series approximation used in main calculation)' @@ -102,7 +124,7 @@ ' ≈39.3469340287% (Taylor series approximation used in main calculation (removes need for factorial calculation))' ] ], - ['2000000 -p 0.5 -b', True, + 
['2000000 -p 0.5 -b -t', True, [ 'The number of samples, sampled uniformly at random from a set of 2^2000000 items, needed to have at least a 50% chance of a non-unique sample is:', ' ≈2^1000000.2356168135 (Taylor series approximation used in main calculation)' @@ -124,13 +146,13 @@ ' ≈11.7503097415% (Taylor series approximation used in main calculation (removes need for factorial calculation))' ] ], - ['52 -p 0.1 -c', True, + ['52 -p 0.1 -c -t', True, [ 'The number of samples, sampled uniformly at random from a set of ≈80529020383886612857810199580012764961409004334781435987268084328737 (≈8*10^67) items, needed to have at least a 10% chance of a non-unique sample is:', ' 4119363813276486714957808853108064 (≈4*10^33) (Taylor series approximation used in main calculation)' ] ], - ['52 -p 0.5 -c', True, + ['52 -p 0.5 -c -t', True, [ 'The number of samples, sampled uniformly at random from a set of ≈80529020383886612857810199580012764961409004334781435987268084328737 (≈8*10^67) items, needed to have at least a 50% chance of a non-unique sample is:', ' 10565837726592754214318243269428637 (≈10^34) (Taylor series approximation used in main calculation)' diff --git a/README.md b/README.md index e6e6002..913da78 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,8 @@ In mathematical terms, this can be expressed as since the probability of picking all of `N` unique is equal to the number of ways to pick `N` unique samples divided by number of ways to pick any `N` samples. This, of course, given the assumption that all `D` items are equally probable. -The project supports calculating both the probability `P` from `N` and `D` (using exact method, exact method with Stirling's approximation in the calculation of faculties and Taylor approximation) and -`N` from `D` and `P` (Taylor approximation only). Both approximations get asymptotically close to the exact result as `D` grows towards infinity. 
The exact method should not be used for larger +The project supports calculating both the probability `P` from `N` and `D` (using exact method, exact method with Stirling's approximation in the calculation of faculties or Taylor approximation) and +`N` from `D` and `P` (using exact method or Taylor approximation). Both approximations get asymptotically close to the exact result as `D` grows towards infinity. The exact method should not be used for larger numbers. For extremely small probabilities `P`, the exact method with Stirling's approximation used for faculties may become unstable as it involves many more different operations than the Taylor approximation which, each, results in small round-offs. Another source of error in this case arises from the use of Stirling's formula for two calculations of faculties (`D!` and `(D - N)!`). Since one of these (`(D - N)!`) diverges slightly more from the exact result than the other (`D!`), the difference between these (used for calculations in log space) might @@ -27,19 +27,19 @@ whether they match well. 
### Parameter legend -Name | Type | Effect | CLI flag ---- | --- | --- | --- -`D` | integer | The size of the set to sample from | - -`N` | integer | The number of samples sampled from `D` | `-n` -`P` | floating point number | The probability of a non-unique sample in `N` | `-p` -`binary` | boolean | Whether to interpret `D` and `N` as base-2 logarithms | `-b` -`combinations` | boolean | Whether to interpret `D` as the size of a set from which we must yield the actual size, `D!`, of the set to sample from | `-c` -`taylor` | boolean | Whether to calculate `P` with Taylor approximation | `-t` -`stirling` | boolean | Whether to calculate `P` with exact method using Stirling's approximation in calculation of faculties | `-s` -`exact` | boolean | Whether to calculate `P` with exact method | `-e` -`all` | boolean | Whether to calculate `P` with all methods (implies `-s -t -e`) | `-a` -`json` | boolean | Whether to output answer as a Json object or as text | `-j` -`prec` | integer | Decimals in the solution where applicable (in [0, 10] with default 10) | `--prec` +| Name | Type | Effect | CLI flag | +|----------------| --- |-------------------------------------------------------------------------------------------------------------------------| --- | +| `D` | integer | The size of the set to sample from | - | +| `N` | integer | The number of samples sampled from `D` | `-n` | +| `P` | floating point number | The probability of a non-unique sample in `N` | `-p` | +| `binary` | boolean | Whether to interpret `D` and `N` as base-2 logarithms | `-b` | +| `combinations` | boolean | Whether to interpret `D` as the size of a set from which we must yield the actual size, `D!`, of the set to sample from | `-c` | +| `taylor` | boolean | Whether to calculate `P` or `N` with Taylor approximation | `-t` | +| `stirling` | boolean | Whether to calculate `P` with exact method using Stirling's approximation in calculation of faculties | `-s` | +| `exact` | boolean | Whether to calculate 
`P` or `N` with exact method | `-e` | +| `all` | boolean | Whether to calculate `P` or `N` with all methods (implies `-s -t -e` for `P` and `-t -e` for `N`) | `-a` | +| `json` | boolean | Whether to output answer as a JSON object or as text | `-j` | +| `prec` | integer | Decimals in the solution where applicable (in [0, 10] with default 10) | `--prec` | ## Versions @@ -69,7 +69,7 @@ Calculate the probability `P` of at least one non-unique birthday among `N`= 23 Calculate, approximatively, the number of times `N` a deck of cards has to be shuffled to have a `P` = 50% probability of seeing a repeated shuffle: - > python BirthdayProblem.py 52 -p 0.5 -c + > python BirthdayProblem.py 52 -p 0.5 -c -t #### Example 3: @@ -96,13 +96,13 @@ The following shows example usage of this project in another application: CalcPrecision = Solver.CalcPrecision [p, pMethod] = Solver.solveForP(Decimal('366'), Decimal('23'), False, False, CalcPrecision.EXACT) - [n, nMethod] = Solver.solveForN(Decimal('52'), Decimal('0.5'), False, True) + [n, nMethod] = Solver.solveForN(Decimal('52'), Decimal('0.5'), False, True, CalcPrecision.TAYLOR_APPROX) The functions to call have signatures def solveForP(dOrDLog, nOrNLog, isBinary, isCombinations, method) - def solveForN(dOrDLog, p, isBinary, isCombinations) + def solveForN(dOrDLog, p, isBinary, isCombinations, method) and may throw exceptions. @@ -147,5 +147,13 @@ Elias Lousseief (2020) * Corrected precision bug in `_BirthdayProblemSolverChecked.birthdayProblemInv`. * Corrected minor bug in `_BirthdayProblemTextFormatter.methodToText`. * Simplified `_BirthdayProblemTextFormatter.methodToText` and `_BirthdayProblemTextFormatter.methodToDescription`. - + +* *v. 1.4* + * Added exact method for calculating `N` given `D` and `P` using a numerical approach. This means that from now on + multiple solution strategies can be used for this calculation as well (earlier this calculation always used Taylor + approximation). 
+ * Fixed bug in method `facultyLog` for when input is 0. Since 0! = 1 and the return value is in log space, the + correct answer is 0 and not 1. + * Added trivial use case for calculating `N` using `D` and `P` when `D` is 1 and `P` is neither 0 nor 1 (in this case + the answer is always 2).