Basic Principle of Programming
一门关于Python编程的课程

quiz_5_this_year

quiz_5.pdf
names.zip

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
# Written by *** for COMP9021
#
# Prompts the user for two capitalised strings of letters,
# say s1 and s2, then for two years in the range 1947--2021,
# say Y1 and Y2, with s1 and s2 and with Y1 and Y2 being separated
# by at least one space, and with possibly any extra spaces
# between s1 and s2, between Y1 and Y2, at the start of either input,
# and at the end of either input.
# - s1 can be lexicographically smaller than or equal to s2
# or the other way around;
# - Y1 can be smaller than or equal to Y2 or the other way around.
#
# Outputs an error message if input is incorrect.
# Otherwise, finds out amongst the first name that starts with s1
# and the last name that starts with s2, which name has been given
# as both a female name and a male name in a year between Y1 and Y2
# included. If there is such a name and year, then outputs all such
# names and years for which the absolute value of the difference
# between
# - the ratio defined as the count of the name as a female name
# in that year over the count of all female names in that year,
# - the ratio defined as the count of the name as a male name
# in that year over the count of all male names in that year,
# is minimal (so essentially, the popularities in that year
# of the name as a female name and of the name as a male name
# are as close as possible).
# Outputs the name, the year, and both ratios as percentages
# printed out with 5 digits after the decimal point.
# In case there are many solutions (that is, same minimal
# difference in popularities), then outputs all solutions
# in increasing lexicographic order of names, and for
# a given name, in increasing lexicographic order of years.
#
# The directory named names is stored in the working directory.
#
# IF YOU USE ABSOLUTE PATHS, YOUR PROGRAM CAN ONLY FAIL TO RUN PROPERLY
# ON MY MACHINE AND YOU WILL SCORE 0 TO THE QUIZ, WITH NO CHANCE FOR YOU
# TO FIX THIS MISTAKE AFTER RESULTS HAVE BEEN RELEASED.
#
# YOU CANNOT USE pandas FOR THIS QUIZ; IF YOU DO, YOU WILL SCORE 0
# TO THE QUIZ.
import os
import sys
import timeit
from pathlib import Path
import csv
from collections import defaultdict

# INSERT YOUR CODE HERE
def _is_capitalised_word(word):
    """Return True if *word* is one uppercase letter followed only by lowercase letters.

    ``str.isalpha`` rejects the empty string, digits and punctuation, so
    inputs such as ``$`` or ``A1b`` are refused here.
    """
    return (
        word.isalpha()
        and word[0].isupper()
        and (len(word) == 1 or word[1:].islower())
    )


# Read the two name prefixes. Anything other than exactly two capitalised
# alphabetic words is reported as incorrect input and ends the program.
try:
    letters = input('Enter two capitalised strings of letters: ').split()
    if len(letters) != 2:
        raise ValueError
    letter1, letter2 = letters
    if not (_is_capitalised_word(letter1) and _is_capitalised_word(letter2)):
        raise ValueError
except ValueError:
    print('Incorrect input, leaving it there.')
    sys.exit()

# Read the two years. Both tokens are kept as strings (integer1, integer2);
# each must be exactly four characters long and denote an integer in the
# range 1947..2021. Any violation, including a token that int() cannot
# parse, is reported as incorrect input and ends the program.
try:
    integers = input('Enter two integers between 1947 and 2021: ').split()
    if len(integers) != 2:
        raise ValueError
    integer1, integer2 = integers
    if not all(
        len(token) == 4 and 1947 <= int(token) <= 2021 for token in integers
    ):
        raise ValueError
except ValueError:
    print('Incorrect input, leaving it there.')
    sys.exit()

# Compute, for a given year, the share of a name's (summed) count for one sex in the total count for that sex.

# Module-level flag; load_data() sets it to True when at least one loaded
# name starts with letter2.
isAppear = False

# The year inputs are still strings here: convert first, then order, so the
# comparison is numeric rather than lexicographic.
lower_bound, upper_bound = sorted((int(integer1), int(integer2)))

# Directory holding the data files, relative to the working directory.
names_dir = Path('names')

# Make letter1 the lexicographically smaller of the two prefixes.
if letter1 > letter2:
    letter1, letter2 = letter2, letter1


# Count, for each year, how many births were recorded for each gender.
def statistic_amount():
    """Return the total count per gender for every year in the selected range.

    Scans the ``*.txt`` files of ``names_dir`` in sorted order. The year of
    a file is read from characters 3 to 6 of its name (presumably files are
    named like ``yobYYYY.txt`` -- confirm against the data set); files whose
    year lies outside ``lower_bound``..``upper_bound`` are skipped.

    Rows that do not have exactly three fields, or whose count field is
    blank, are ignored. A blank count used to be left as a string and then
    added to an integer, which raised ``TypeError``.

    Returns:
        dict: ``{year: {gender: total_count}}`` with integer years and counts.
    """
    gender_per_year = {}  # outer dictionary, keyed by year
    for filename in sorted(names_dir.glob('*.txt')):
        year = int(filename.name[3:7])
        if not lower_bound <= year <= upper_bound:
            continue
        with open(filename, 'r', newline='') as file:
            for row in csv.reader(file):
                if len(row) != 3:
                    # Malformed row: skip it.
                    continue
                _, gender, count = row
                if not count.strip():
                    # Nothing to add for a blank count.
                    continue
                # Inner dictionary, keyed by gender; created on first use.
                totals = gender_per_year.setdefault(year, {})
                totals[gender] = totals.get(gender, 0) + int(count)
    return gender_per_year

def load_data(lower_bound, upper_bound, letter2):
    """Load every record of the files whose year is within the given bounds.

    Files are visited in sorted order, as in ``statistic_amount``, so the
    order of the returned records does not depend on the order in which the
    file system lists the directory.

    Args:
        lower_bound: first year to load (inclusive).
        upper_bound: last year to load (inclusive).
        letter2: prefix whose presence among the loaded names is reported.

    Returns:
        tuple: ``(name_list, isAppear)`` where ``name_list`` is a list of
        ``[name, gender, count, year]`` (``count`` is the raw string field,
        ``year`` an int) and ``isAppear`` tells whether some loaded name
        starts with ``letter2``.

    Side effects:
        Also stores the flag in the module-level ``isAppear`` variable.
    """
    global isAppear
    isAppear = False
    name_list = []

    for filename in sorted(names_dir.glob('*.txt')):
        year = int(filename.name[3:7])
        if not lower_bound <= year <= upper_bound:
            continue
        with open(filename, 'r', newline='') as file:
            for row in csv.reader(file):
                if len(row) != 3:
                    # Malformed row: skip it.
                    continue
                name, gender, count = row
                name_list.append([name, gender, count, year])
                if name.startswith(letter2):
                    isAppear = True

    return name_list, isAppear

def sort_name_list(name_list, letter2, isAppear):
    """Order the loaded records for the range scan.

    Args:
        name_list: records of the form ``[name, gender, count, year]``.
        letter2: the upper prefix of the requested range.
        isAppear: whether some record's name starts with ``letter2``.

    Returns:
        list: a new list; ``name_list`` itself is not modified.

        * If ``isAppear`` is false, all records sorted by name.
        * Otherwise, the records whose name is lexicographically smaller
          than ``letter2``, sorted by name, followed by the records whose
          name starts with ``letter2`` in their original relative order.
          Records with a larger name that does not start with ``letter2``
          are dropped.

    The previous implementation cut the sorted part at the index of the
    last name smaller than ``letter2``, which silently dropped that name.
    """
    if not isAppear:
        return sorted(name_list, key=lambda record: record[0])

    with_prefix = []
    before_prefix = []
    for record in name_list:
        name = record[0]
        if name.startswith(letter2):
            with_prefix.append(record)
        elif name < letter2:
            before_prefix.append(record)

    # sort() is stable, so records sharing a name keep their loading order.
    before_prefix.sort(key=lambda record: record[0])
    return before_prefix + with_prefix

def statistic_all_year_name():
    """Load the records of the selected years and return them in scan order.

    Uses the module-level ``lower_bound``, ``upper_bound`` and ``letter2``;
    loading is delegated to ``load_data`` and ordering to ``sort_name_list``.
    """
    records, prefix_found = load_data(lower_bound, upper_bound, letter2)
    return sort_name_list(records, letter2, prefix_found)



def statistic_year_name():
    """Accumulate female and male counts per ``(year, name)`` over the scanned range.

    Walks the records returned by ``statistic_all_year_name`` in order:

    * records are ignored until the first name starting with ``letter1``;
    * from then on every record with a non-blank count is added to the
      ``'female'`` or ``'male'`` total of its ``(year, name)`` key,
      depending on whether its gender is ``'F'`` or ``'M'``;
    * the walk stops, after counting the current record, at the stop
      record described below.

    Stop record (depends on the module-level ``isAppear`` flag):

    * ``isAppear`` false: the first record whose name starts with the
      character following ``letter2``'s first character.
    * ``isAppear`` true: the first name starting with ``letter2`` is
      remembered together with its gender; the walk stops at the next
      record bearing that same name with a different gender.

    Returns:
        defaultdict: ``{(year, name): {'female': int, 'male': int}}``.
    """
    name_dict = defaultdict(lambda: {'female': 0, 'male': 0})
    started = False
    stop_name = None
    stop_gender = None
    # First character after letter2's initial; only consulted when no loaded
    # name starts with letter2. Loop-invariant, so computed once.
    end_guard = chr(ord(letter2[0]) + 1)

    for name, gender, count, year in statistic_all_year_name():
        if not started:
            if not name.startswith(letter1):
                continue
            started = True

        if not isAppear:
            if name.startswith(end_guard):
                stop_name = name
        elif stop_name is None and name.startswith(letter2):
            stop_name = name
            stop_gender = gender

        # A blank count contributes nothing; it must not reach "+=" as a str.
        if count.strip():
            amount = int(count)
            if gender == 'F':
                name_dict[(year, name)]['female'] += amount
            elif gender == 'M':
                name_dict[(year, name)]['male'] += amount

        # The stop record itself has been counted just above.
        if name == stop_name and (not isAppear or gender != stop_gender):
            break

    return name_dict


def statistic_year_percentage():
    """Turn the per-name counts into ratios of the yearly totals.

    For every ``(year, name)`` produced by ``statistic_year_name``, divides
    the female count by that year's ``'F'`` total and the male count by that
    year's ``'M'`` total (both taken from ``statistic_amount``). A ratio is 0
    when its total is 0, and the entry is left out altogether when neither
    total is positive.

    Returns:
        dict: ``{(year, name): {'female': ratio, 'male': ratio}}``.
    """
    totals_by_year = statistic_amount()
    counts = statistic_year_name()

    name_ratio_dict = {}
    for key, tally in counts.items():
        year_totals = totals_by_year[key[0]]
        females = year_totals.get('F', 0)
        males = year_totals.get('M', 0)
        if females <= 0 and males <= 0:
            continue
        name_ratio_dict[key] = {
            'female': tally['female'] / females if females != 0 else 0,
            'male': tally['male'] / males if males != 0 else 0,
        }
    return name_ratio_dict


def calculate_difference(name_ratio_dict):
    """Return the absolute female/male ratio gap of every name given to both sexes.

    Args:
        name_ratio_dict: ``{(year, name): {'female': ratio, 'male': ratio}}``.

    Returns:
        dict: ``{(year, name): abs(female - male)}`` restricted to the entries
        whose two ratios are both non-zero.

    If no retained name starts with the module-level ``letter1`` or
    ``letter2`` (in particular when nothing is retained), prints a message
    and terminates the program through ``sys.exit()``.
    """
    difference_dict = {
        key: abs(ratios['female'] - ratios['male'])
        for key, ratios in name_ratio_dict.items()
        if ratios['female'] != 0 and ratios['male'] != 0
    }

    prefixes = (letter1, letter2)
    if not any(str(key[1]).startswith(prefixes) for key in difference_dict):
        print('No name was given as both female and male names.')
        sys.exit()
    return difference_dict


def find_min_differences_by_year(difference_dict):
    """Group, per year, the names whose difference is the smallest of that year.

    Args:
        difference_dict: ``{(year, name): difference}``.

    Returns:
        defaultdict(list): ``{year: [[name, difference], ...]}`` where every
        listed entry shares the year's minimal difference; ties are kept in
        the iteration order of ``difference_dict``.
    """
    min_differences_by_year = defaultdict(list)

    for (year, name), difference in difference_dict.items():
        # .get() does not create an entry for a year not seen yet.
        current = min_differences_by_year.get(year)
        if current is None or difference < current[0][1]:
            min_differences_by_year[year] = [[name, difference]]
        elif difference == current[0][1]:
            current.append([name, difference])

    return min_differences_by_year

# Run the pipeline: ratios for every scanned (year, name), the female/male
# gap of the names given to both sexes, and the smallest gaps of each year.
name_ratio_dict = statistic_year_percentage()
difference_dict = calculate_difference(name_ratio_dict)
min_differences_by_year = find_min_differences_by_year(difference_dict)

# All entries of a year's list share that year's minimum, so the first entry
# of each list is enough to find the overall minimum.
min_difference = min(
    entries[0][1] for entries in min_differences_by_year.values()
)

# Winning years and names are kept as parallel lists: best_name[i] was
# selected for best_year[i].
best_year = []
best_name = []
for year, data_list in min_differences_by_year.items():
    for name, difference in data_list:
        if difference == min_difference:
            best_year.append(year)
            best_name.append(name)

print('Here are the names that were given as both')
print('female and male names, for the smallest difference')
print('of ratio as a female name over all female names')
print('and ratio as a male name over all male names,')
print('for the years when that happened:')
# Report each winning (name, year) pair as a pair, ordered by name and then
# by year. Testing name and year membership separately would also print a
# winning name in another name's winning year.
for name, year in sorted(zip(best_name, best_year)):
    ratios = name_ratio_dict[(year, name)]
    print(f' {name} in {year}, for ratios of')
    print(f" - {ratios['female']:.5%} as a female name,")
    print(f" - {ratios['male']:.5%} as a male name.")