import random

import numpy as np
import pandas as pd

# Raise pandas' display limits so that whole tables are shown instead of
# being truncated: up to 200 rows, 50 columns and 1000 characters per cell.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 1000)


# Load every application from the spreadsheet into one DataFrame.
all_applicants = pd.read_excel('Applicants.xlsx')
# Bare expression: (rows, columns) of the loaded table, for display.
all_applicants.shape

(109, 17)


# Don't run this code in production
# ------------------------------------------------------------
# Anonymises `all_applicants` in place: personal columns are overwritten with
# a fixed marker and real project titles are swapped for numbered
# placeholders of the form "Project N title...".

# Redact personal data
columns_to_redact = [
    'Name', 'Preferred Name', 'Organisation', 'Email address', 'Role', 'Research interests',
    'Please confirm you can meet the following requirements', 'JASMIN username', 'GitHub username',
    'Please state the limitations on your availability', 'Any other information',
]
for column in columns_to_redact:
    all_applicants[column] = "*** redacted ***"

# Shorten project names
from itertools import chain

first_choice_column = 'First-choice projects of interest (tick all that apply)'
second_choice_column = 'Second-choice projects of interest (tick all that apply)'

first_choices = all_applicants[first_choice_column].str.strip(';').str.split(';')

# dict.fromkeys() rather than set(): it de-duplicates while keeping the
# first-seen order, so the placeholder numbering is reproducible.
# - dropna(): a blank cell is NaN after .str.split() and cannot be iterated.
# - `if title`: an empty title (e.g. from "A;;B") would make str.replace()
#   insert the placeholder between every character.
projects = dict.fromkeys(
    title for title in chain.from_iterable(first_choices.dropna()) if title
)
placeholders = {
    title: f'Project {number} title...'
    for number, title in enumerate(projects, start=1)
}

# Replace the longest titles first so that a title which contains another
# title as a substring is rewritten as a whole, not partially.
for to_replace in sorted(placeholders, key=len, reverse=True):
    replace_with = placeholders[to_replace]
    for choice_column in (first_choice_column, second_choice_column):
        all_applicants[choice_column] = (
            all_applicants[choice_column]
            .str.replace(to_replace, replace_with, regex=False)
        )

# ------------------------------------------------------------


# Alternative selection by e-mail domain (disabled):
#included = (
#    all_applicants['Email address'].str.endswith('.ac.uk') |
#    all_applicants['Email address'].str.endswith('@metoffice.gov.uk')
#)
# Boolean mask: True for rows whose 'Include' column is exactly 'Yes'.
included = all_applicants['Include'].eq('Yes')

# Work on an independent copy of the selected rows so that columns added
# later do not touch `all_applicants`.
applicants = all_applicants.loc[included].copy()
applicants.shape

(90, 17)


# Preview the first two included applicants.
applicants.head(2)


# The eleven placeholder project titles, "Project 1 title..." through
# "Project 11 title...". The list order is also the category order of the
# assignment Series built in assign_participants().
projects = [f"Project {number} title..." for number in range(1, 12)]


def _split_choices(column):
    """Return, per applicant, the list of project names ticked in *column*.

    Cells hold ';'-separated titles; leading/trailing ';' are stripped
    before splitting so they do not produce empty entries at the ends.
    """
    return applicants[column].str.strip(';').str.split(';')


first_choices = _split_choices('First-choice projects of interest (tick all that apply)')
second_choices = _split_choices('Second-choice projects of interest (tick all that apply)')

first_choices.head(2)

0    [Project 1 title..., Project 2 title...]
1                        [Project 1 title...]
Name: First-choice projects of interest (tick all that apply), dtype: object


# Average number of applicants per project if they were spread evenly.
len(applicants) / len(projects)

8.181818181818182


def assign_participants(ideal_group_sizes, random_state=1234):
    """Assign each applicant to one of their first-choice projects.

    Reads two module-level names: ``first_choices`` (a Series holding, per
    applicant, the list of first-choice project names) and ``projects``
    (the list of all project names, used as the result's categories).

    Assignment happens in three passes over ``first_choices``:

    1. applicants with exactly one first choice get that project;
    2. applicants with exactly two first choices get a weighted random draw;
    3. applicants with more than two first choices get a weighted random draw.

    The weight of a project in a draw is ``1 / exp(current - ideal)``, where
    ``current`` is the number of applicants assigned to it so far. Projects
    at or above their ideal size therefore become progressively less likely.

    Parameters
    ----------
    ideal_group_sizes : mapping
        Project name -> ideal number of participants. Only looked up for
        projects that take part in a random draw.
    random_state : int, optional
        Seed for the random draws. The same seed and inputs give the same
        assignments.

    Returns
    -------
    pandas.Series
        Categorical Series indexed like ``first_choices``. Applicants with
        no usable first choice (missing cell, or only blank entries) are
        left unassigned (missing value).

    Raises
    ------
    KeyError
        If a project offered in a random draw is missing from
        ``ideal_group_sizes`` or from ``projects``.
    """
    # A private generator yields the same draws as seeding the global
    # ``random`` module with ``random_state``, but leaves the global
    # generator untouched for any other code that relies on it.
    rng = random.Random(random_state)

    assignments = pd.Series(
        index=first_choices.index,
        dtype=pd.CategoricalDtype(categories=projects),
    )

    def usable(choices):
        """Return the non-blank choices; [] for a missing (non-iterable) cell."""
        try:
            return [choice for choice in choices if choice]
        except TypeError:
            # e.g. NaN, which is what .str.split() leaves for an empty cell
            return []

    cleaned = [
        (applicant, usable(choices))
        for applicant, choices in first_choices.items()
    ]

    def get_weights(choices):
        """Return normalised draw weights for *choices* given current sizes."""
        group_sizes = assignments.value_counts()
        weights = [
            # Penalise if greater than ideal group size
            1 / np.exp(group_sizes[choice] - ideal_group_sizes[choice])
            for choice in choices
        ]

        # Normalise (total computed once). random.choices() does not need
        # normalised weights, but keeping this step keeps the floating-point
        # arithmetic, and hence the draws for a given seed, unchanged.
        total = sum(weights)
        return [weight / total for weight in weights]

    def draw(choices):
        """Pick one of *choices* at random using the current weights."""
        return rng.choices(choices, get_weights(choices))[0]

    # Start with those people who only chose one first-choice project
    for applicant, choices in cleaned:
        if len(choices) == 1:
            assignments.loc[applicant] = choices[0]

    # Then randomly assign those who chose exactly two first-choice projects
    for applicant, choices in cleaned:
        if len(choices) == 2:
            assignments.loc[applicant] = draw(choices)

    # Now continue for applicants with any larger number of first choices
    for applicant, choices in cleaned:
        if len(choices) > 2:
            assignments.loc[applicant] = draw(choices)

    return assignments


# Ideal number of participants for each project, passed to
# assign_participants() to weight its random draws.
target_sizes = {
    'Project 1 title...': 14,
    'Project 2 title...': 13,
    'Project 3 title...': 8,
    'Project 4 title...': 12,
    'Project 5 title...': 6,
    'Project 6 title...': 8,
    'Project 7 title...': 7,
    'Project 8 title...': 6,
    'Project 9 title...': 8,
    'Project 10 title...': 7,
    'Project 11 title...': 9,
}
assignments = assign_participants(ideal_group_sizes=target_sizes)

# Resulting number of applicants per project
assignments.value_counts()

Project 1 title...     13
Project 4 title...     13
Project 2 title...     12
Project 5 title...      7
Project 6 title...      7
Project 7 title...      7
Project 10 title...     7
Project 3 title...      6
Project 8 title...      6
Project 9 title...      6
Project 11 title...     6
dtype: int64


# Attach each applicant's assigned project as a new column (pandas aligns
# the Series to the DataFrame by index label), then preview the first two rows.
applicants['Assigned project'] = assignments
applicants.head(2)

Assign applicants to projects

Load data

Extract chosen projects

Assignments

Acknowledgements

	Name	Preferred Name	Pronouns	Organisation	Email address	Role	Research interests	Include	First-choice projects of interest (tick all that apply)	Second-choice projects of interest (tick all that apply)	Please confirm you can meet the following requirements	Do you already have a JASMIN account?	JASMIN username	GitHub username	Happy for name/institution to go on website?	Please state the limitations on your availability	Any other information
0	* redacted *	* redacted *	He/Him	* redacted *	* redacted *	* redacted *	* redacted *	Yes	Project 1 title...;Project 2 title...;	Project 7 title...;Project 4 title...;	* redacted *	Yes	* redacted *	* redacted *	Yes	* redacted *	* redacted *
1	* redacted *	* redacted *	He/him	* redacted *	* redacted *	* redacted *	* redacted *	Yes	Project 1 title...;	Project 7 title...;Project 10 title...;	* redacted *	No - we will allocate you a temporary account	* redacted *	* redacted *	Yes	* redacted *	* redacted *