# Principles of Programming: QUIZ_5
#
# quiz_5_this_year
#
# Source: https://blog.stellariumimpl.cloud/1999/09/14/comp9021_quiz/
# Written by *** for COMP9021
#
# Prompts the user for two capitalised strings of letters,
# say s1 and s2, then for two years in the range 1947--2021,
# say Y1 and Y2, with s1 and s2 and with Y1 and Y2 being separated
# by at least one space, and with possibly any extra spaces
# between s1 and s2, between Y1 and Y2, at the start of either input,
# and at the end of either input.
# - s1 can be lexicographically smaller than or equal to s2
#   or the other way around;
# - Y1 can be smaller than or equal to Y2 or the other way around.
#
# Outputs an error message if input is incorrect.
# Otherwise, finds out amongst the first name that starts with s1
# and the last name that starts with s2, which name has been given
# as both a female name and a male name in a year between Y1 and Y2
# included. If there is such a name and year, then outputs all such
# names and years for which the absolute value of the difference
# between
# - the ratio defined as the count of the name as a female name
#   in that year over the count of all female names in that year,
# - the ratio defined as the count of the name as a male name
#   in that year over the count of all male names in that year,
# is minimal (so essentially, the popularities in that year
# of the name as a female name and of the name as a male name
# are as close as possible).
# Outputs the name, the year, and both ratios as percentages
# printed out with 5 digits after the decimal point.
# In case there are many solutions (that is, same minimal
# difference in popularities), then outputs all solutions
# in increasing lexicographic order of names, and for
# a given name, in increasing lexicographic order of years.
#
# The directory named names is stored in the working directory.
#
# IF YOU USE ABSOLUTE PATHS, YOUR PROGRAM CAN ONLY FAIL TO RUN PROPERLY
# ON MY MACHINE AND YOU WILL SCORE 0 TO THE QUIZ, WITH NO CHANCE FOR YOU
# TO FIX THIS MISTAKE AFTER RESULTS HAVE BEEN RELEASED.
#
# YOU CANNOT USE pandas FOR THIS QUIZ; IF YOU DO, YOU WILL SCORE 0
# TO THE QUIZ.
import os
import sys
import timeit
from pathlib import Path
import csv
from collections import defaultdict

# INSERT YOUR CODE HERE
# Read the two name prefixes. Exactly two whitespace-separated tokens are
# required, and each must consist of letters only, with an uppercase first
# letter and every following letter lowercase.
# The isalpha() test is essential: str.islower() ignores uncased characters,
# so without it inputs such as 'Ab1' or a lone '$' would be accepted.
letters = input('Enter two capitalised strings of letters: ').split()
if len(letters) != 2 or not all(
        word.isalpha()
        and word[0].isupper()
        and (len(word) == 1 or word[1:].islower())
        for word in letters
):
    print('Incorrect input, leaving it there.')
    sys.exit()
letter1, letter2 = letters

# Read the two years. Exactly two whitespace-separated tokens are required;
# each must parse as an integer within 1947..2021 and be exactly four
# characters long. Any violation (including a token int() cannot parse)
# prints the error message and ends the program.
# integer1 and integer2 are kept as the raw input strings.
try:
    integers = list(input('Enter two integers between 1947 and 2021: ').split())
    if len(integers) != 2:
        raise ValueError
    integer1, integer2 = integers
    if not (1947 <= int(integer1) <= 2021
            and 1947 <= int(integer2) <= 2021
            and len(integer1) == 4
            and len(integer2) == 4):
        raise ValueError
except ValueError:
    print('Incorrect input, leaving it there.')
    sys.exit()

# Goal of the code below: for each year, compute the share of a given name
# among all names recorded for the same gender in that year.

# Module-level flag telling whether some loaded name starts with letter2;
# load_data() overwrites it.
isAppear = False

# Order the two years numerically. Comparing the raw input strings would rely
# on lexicographic order, which only matches numeric order for ASCII digits.
lower_bound, upper_bound = sorted((int(integer1), int(integer2)))

# Relative path on purpose: the data directory sits in the working directory.
names_dir = Path('names')

# Make sure letter1 is the lexicographically smaller prefix.
if letter1 > letter2:
    letter1, letter2 = letter2, letter1


def statistic_amount():
    """Sum the counts per gender for every year in the selected range.

    Reads each '*.txt' file in names_dir whose name carries, at character
    positions 3..6, a year between lower_bound and upper_bound inclusive
    (presumably files are named like 'yobYYYY.txt' -- confirm against the
    data set). Rows that do not have exactly three fields, or whose count
    field is blank, are ignored.

    Returns:
        dict mapping year -> {gender: total count for that gender}.
    """
    totals = {}
    for path in sorted(names_dir.glob('*.txt')):
        year = int(path.name[3:7])
        if not lower_bound <= year <= upper_bound:
            continue
        with open(path, 'r', newline='') as handle:
            for record in csv.reader(handle):
                if len(record) != 3:
                    # Malformed row: skip it silently.
                    continue
                _, gender, count = record
                if not count.strip():
                    continue
                amount = int(count)
                per_gender = totals.setdefault(year, {})
                per_gender[gender] = per_gender.get(gender, 0) + amount
    return totals

def load_data(lower_bound, upper_bound, letter2):
    """Load every well-formed row of the files for the requested years.

    Args:
        lower_bound: first year to include.
        upper_bound: last year to include.
        letter2: prefix used to set the module-level isAppear flag.

    Returns:
        A pair (name_list, isAppear) where name_list holds
        [name, gender, count, year] lists (count is still the raw string)
        and isAppear tells whether at least one loaded name starts with
        letter2.

    Side effects:
        Overwrites the module-level isAppear flag with the returned value.
    """
    global isAppear
    name_list = []

    # Sort the paths: the order in which glob() yields files depends on the
    # file system, and later processing depends on the order of the rows.
    # statistic_amount() iterates the same files in sorted order as well.
    for path in sorted(names_dir.glob('*.txt')):
        # Characters 3..6 of the file name are parsed as the year.
        year = int(path.name[3:7])
        if not lower_bound <= year <= upper_bound:
            continue
        with open(path, 'r', newline='') as file:
            name_list.extend(
                [*row, year] for row in csv.reader(file) if len(row) == 3
            )

    isAppear = any(record[0].startswith(letter2) for record in name_list)
    return name_list, isAppear

def sort_name_list(name_list, letter2, isAppear):
    """Order the loaded rows and cut them off after the letter2 names.

    Args:
        name_list: rows of the form [name, gender, count, year].
        letter2: the upper prefix.
        isAppear: whether some row's name starts with letter2.

    Returns:
        A new list; name_list itself is not modified.
        - If isAppear is false: all rows sorted by name (stable sort).
        - Otherwise: every row whose name is smaller than letter2, sorted
          by name (stable sort), followed by the rows whose name starts
          with letter2 in their original relative order. Rows whose name
          is greater than letter2 without starting with it are dropped.
    """
    def by_name(record):
        return record[0]

    if not isAppear:
        return sorted(name_list, key=by_name)

    with_prefix = [
        record for record in name_list if record[0].startswith(letter2)
    ]
    # Keep ALL rows below letter2. Slicing the sorted list at the index of
    # the last such row (instead of one past it) would lose that row.
    before_prefix = sorted(
        (
            record for record in name_list
            if not record[0].startswith(letter2) and record[0] < letter2
        ),
        key=by_name,
    )
    return before_prefix + with_prefix

def statistic_all_year_name():
    """Return the rows for the selected years, ordered by sort_name_list().

    Loads the rows with load_data() using the module-level lower_bound,
    upper_bound and letter2, then passes them, together with the flag
    load_data() returned, to sort_name_list().
    """
    records, prefix_found = load_data(lower_bound, upper_bound, letter2)
    return sort_name_list(records, letter2, prefix_found)



def statistic_year_name():
    """Accumulate female/male counts per (year, name) over a window of rows.

    Walks the rows returned by statistic_all_year_name() in order. Rows are
    skipped until the first one whose name starts with the module-level
    letter1; from there on, each row with a non-blank count adds that count
    to the 'female' or 'male' total of its (year, name) key, depending on
    whether its gender field is 'F' or 'M'.

    The walk stops early, after the current row has been accumulated:
    - when the module-level isAppear is false: at the first row whose name
      starts with the character that follows letter2[0];
    - when isAppear is true: at the first row that has the same name as the
      first row seen starting with letter2, but a different gender.

    Returns:
        defaultdict mapping (year, name) -> {'female': int, 'male': int}.

    NOTE(review): both stop conditions depend on the row order produced
    upstream; in the isAppear case, rows located after the stopping row are
    never counted even if their name starts with letter2 -- confirm that
    this is the intended window.
    """
    start_processing = False
    find_over_name = False
    name_dict = defaultdict(lambda: {'female': 0, 'male': 0})
    # Name of the row on which the walk has to stop, once known.
    signal_over_name = None

    name_list = statistic_all_year_name()
    for item in name_list:
        name, gender, count, year = item

        # Skip everything located before the first name starting with letter1.
        if str(name).startswith(letter1) and start_processing == False:
            start_processing = True
        if start_processing == False:
            continue

        if isAppear == False:
            # No loaded name starts with letter2: use the next character
            # after letter2's first character as the end marker.
            end_guard = chr(ord(letter2[0]) + 1)
            if str(name).startswith(end_guard):
                signal_over_name = str(name)
                find_over_name = True


        elif str(name).startswith(letter2) and find_over_name == False:

            # First row whose name starts with letter2: remember its name and
            # gender; they define the stop condition checked further down.
            signal_over_name = str(name)
            find_over_name = True
            gender_mark = gender

        if count.strip():  # ignore rows whose count is empty or only whitespace
            count = int(count)
            if gender == 'F':
                name_dict[(year, name)]['female'] += count
            elif gender == 'M':
                name_dict[(year, name)]['male'] += count

        # The stop checks come after the accumulation, so the row that
        # triggers the break is itself included in the totals.
        if isAppear == True:
            # gender_mark is only evaluated once signal_over_name has been set
            # (the name comparison short-circuits while it is still None).
            if str(name) == signal_over_name and gender != gender_mark:
                break
        else:
            if str(name) == signal_over_name:
                break
    return name_dict


def statistic_year_percentage():
    """Turn per-(year, name) counts into shares of the yearly gender totals.

    Uses statistic_amount() for the yearly totals per gender and
    statistic_year_name() for the per-name counts. An entry is produced
    only when at least one of the two yearly totals is positive; a ratio
    whose yearly total is zero is reported as 0.

    Returns:
        dict mapping (year, name) -> {'female': ratio, 'male': ratio}.
    """
    yearly_totals = statistic_amount()
    counts_by_key = statistic_year_name()

    ratios = {}
    for key, tally in counts_by_key.items():
        year = key[0]
        females_that_year = yearly_totals[year].get('F', 0)
        males_that_year = yearly_totals[year].get('M', 0)

        if females_that_year <= 0 and males_that_year <= 0:
            continue

        ratios[key] = {
            'female': (tally['female'] / females_that_year
                       if females_that_year != 0 else 0),
            'male': (tally['male'] / males_that_year
                     if males_that_year != 0 else 0),
        }
    return ratios


def calculate_difference(name_ratio_dict):
    """Compute |female ratio - male ratio| for names used for both genders.

    Args:
        name_ratio_dict: mapping (year, name) -> {'female': r, 'male': r}.

    Returns:
        dict mapping (year, name) -> absolute difference of the two ratios,
        restricted to the entries whose two ratios are both non-zero.

    Side effects:
        If no retained name starts with the module-level letter1 or letter2
        (in particular if nothing is retained), prints a message and ends
        the program through sys.exit().
    """
    difference_dict = {
        key: abs(ratios['female'] - ratios['male'])
        for key, ratios in name_ratio_dict.items()
        if ratios['female'] != 0 and ratios['male'] != 0
    }

    prefix_seen = any(
        str(name).startswith((letter1, letter2))
        for _, name in difference_dict
    )
    if not prefix_seen:
        print('No name was given as both female and male names.')
        sys.exit()
    return difference_dict


def find_min_differences_by_year(difference_dict):
    """Keep, for every year, the names reaching that year's smallest difference.

    Args:
        difference_dict: mapping (year, name) -> difference.

    Returns:
        defaultdict(list) mapping year -> list of [name, difference] pairs,
        all sharing the minimal difference found for that year, in the
        order in which they were met.
    """
    best_per_year = defaultdict(list)

    for key, gap in difference_dict.items():
        year, name = key
        # .get() does not create an entry, unlike indexing a defaultdict.
        current = best_per_year.get(year)
        if current is None or gap < current[0][1]:
            # First entry for the year, or a strictly better one: restart.
            best_per_year[year] = [[name, gap]]
        elif gap == current[0][1]:
            current.append([name, gap])

    return best_per_year

# Main pipeline: ratios -> differences -> per-year minima -> global minimum.
name_ratio_dict = statistic_year_percentage()
difference_dict = calculate_difference(name_ratio_dict)
min_differences_by_year = find_min_differences_by_year(difference_dict)

# Every per-year list only holds entries with that year's minimal difference,
# so the first entry of each list is enough to find the overall minimum.
min_difference = min(
    entries[0][1] for entries in min_differences_by_year.values()
)

# Keep each solution as one (year, name) pair. Tracking names and years in two
# separate lists and testing them independently would also select mixed
# combinations (name of one solution, year of another) when several pairs tie.
best_pairs = {
    (year, name)
    for year, entries in min_differences_by_year.items()
    for name, difference in entries
    if difference == min_difference
}

print('Here are the names that were given as both')
print('female and male names, for the smallest difference')
print('of ratio as a female name over all female names')
print('and ratio as a male name over all male names,')
print('for the years when that happened:')
# Solutions are listed by increasing name, then by increasing year.
for year, name in sorted(best_pairs, key=lambda pair: (pair[1], pair[0])):
    ratios = name_ratio_dict[(year, name)]

    # Render both ratios as percentages with 5 digits after the decimal point.
    percentage_str_female = f'{ratios["female"]:.5%}'
    percentage_str_male = f'{ratios["male"]:.5%}'

    print(f'  {name} in {year}, for ratios of')
    print(f'    - {percentage_str_female} as a female name,')
    print(f'    - {percentage_str_male} as a male name.')

# print("The time difference is :", timeit.default_timer() - starttime)