I'm new to the Python language and having trouble writing the following method for
ID: 3663112 • Letter: I
Question
I'm new to the Python language and am having trouble writing the following method for a decision tree. Any help would be appreciated.
def counts_to_entropy(counts):
entropy = 0.0
# TODO: should convert a dictionary of counts into entropy
return entropy
Here are the given two classes for this assignment:
//////////////////////// Tree class starts ////////////////////////////////
from math import log
class Tree:
    # A decision-tree node. A leaf node (leaf == True) carries a
    # `prediction` dict mapping label -> probability. An internal node
    # (leaf == False) carries the split `feature` index, the numeric
    # `threshold`, and `left`/`right` child Trees (left: value < threshold,
    # right: value >= threshold — see predict()).
    leaf = True
    prediction = None
    feature = None
    threshold = None
    left = None
    right = None
def predict(tree, point):
    """Walk the tree from the root and return the reached leaf's prediction dict."""
    node = tree
    while not node.leaf:
        # Descend left when the point's value on the split feature is
        # strictly below the node's threshold, right otherwise.
        if point.values[node.feature] < node.threshold:
            node = node.left
        else:
            node = node.right
    return node.prediction
def most_likely_class(prediction):
    """Return the label with the highest probability in a prediction dict.

    Ties resolve to the first maximal key in the dict's iteration order,
    matching the original index-of-max implementation.
    """
    return max(prediction, key=prediction.get)
def accuracy(data, predictions):
    """Fraction of points whose most-likely predicted label matches the truth.

    `predictions[i]` is a dict label -> probability for `data[i]`.
    """
    correct = 0
    total = 0
    for i in range(len(data)):
        total += 1
        prediction = predictions[i]
        # Inlined most-likely-class lookup: the key with the largest probability.
        if max(prediction, key=prediction.get) == data[i].label:
            correct += 1
    return float(correct) / total
def split_data(data, feature, threshold):
    """Partition `data` on one feature (implements the TODO stub).

    left collects points whose values[feature] is strictly less than
    `threshold`; right collects points with values[feature] >= threshold.
    Relative order of points is preserved in both partitions.
    Returns the tuple (left, right).
    """
    left = []
    right = []
    for point in data:
        if point.values[feature] < threshold:
            left.append(point)
        else:
            right.append(point)
    return (left, right)
def count_labels(data):
    """Count label occurrences in `data` (implements the TODO stub).

    Returns a dict mapping label -> count, e.g. {'spam': 10, 'ham': 4}.
    An empty dataset yields an empty dict.
    """
    counts = {}
    for point in data:
        # .get with a default handles labels not seen yet.
        counts[point.label] = counts.get(point.label, 0) + 1
    return counts
def counts_to_entropy(counts):
    """Convert a dict of label counts into Shannon entropy (implements the TODO).

    counts: dict mapping label -> occurrence count, e.g. {'spam': 10, 'ham': 4}.
    Uses log base 2 (bits); note information-gain comparisons are invariant
    to the base. Returns 0.0 for an empty dict or a single-label (pure) node.
    """
    entropy = 0.0
    total = sum(counts.values())
    if total == 0:
        return entropy
    for count in counts.values():
        if count > 0:  # p*log(p) -> 0 as p -> 0; skip to avoid log(0)
            p = float(count) / total
            entropy -= p * log(p, 2)
    return entropy
def get_entropy(data):
    """Entropy of the label distribution of `data` (count, then convert)."""
    return counts_to_entropy(count_labels(data))
# This is a correct but inefficient way to find the best threshold to maximize
# information gain.
def find_best_threshold(data, feature):
    """Try every point's feature value as a threshold.

    O(n^2): each candidate re-splits and re-scores the entire dataset.
    Returns (best_gain, best_threshold); best_threshold stays None when
    no candidate achieves strictly positive gain.
    """
    base_entropy = get_entropy(data)
    best_gain = 0
    best_threshold = None
    for candidate in data:
        t = candidate.values[feature]
        left, right = split_data(data, feature, t)
        # Size-weighted average entropy of the two partitions.
        weighted = (get_entropy(left) * len(left) + get_entropy(right) * len(right)) / len(data)
        gain = base_entropy - weighted
        if gain > best_gain:
            best_gain = gain
            best_threshold = t
    return (best_gain, best_threshold)
def find_best_threshold_fast(data, feature):
    """Find the threshold on `feature` maximizing information gain (TODO done).

    O(n log n): sorts the points once, then sweeps candidate thresholds in
    order while maintaining running label counts for the left partition,
    instead of re-splitting and re-counting for every candidate as the
    slow version does. Candidate thresholds are the distinct feature
    values, splitting as left: value < t, right: value >= t — the same
    candidates the slow version tries.
    Returns (best_gain, best_threshold); best_threshold is None when no
    split yields strictly positive gain.
    """
    entropy = get_entropy(data)
    best_gain = 0
    best_threshold = None
    n = len(data)
    if n == 0:
        return (best_gain, best_threshold)
    total_counts = count_labels(data)
    ordered = sorted(data, key=lambda p: p.values[feature])
    left_counts = {}
    for i in range(n):
        value = ordered[i].values[feature]
        # Evaluate threshold=value before absorbing ordered[i]; then the
        # left partition is exactly ordered[:i] (all strictly < value).
        # Skip duplicates (same split) and i == 0 (empty left, gain 0).
        if i > 0 and value != ordered[i - 1].values[feature]:
            right_counts = {}
            for label, count in total_counts.items():
                rest = count - left_counts.get(label, 0)
                if rest > 0:
                    right_counts[label] = rest
            weighted = (counts_to_entropy(left_counts) * i +
                        counts_to_entropy(right_counts) * (n - i)) / n
            gain = entropy - weighted
            if gain > best_gain:
                best_gain = gain
                best_threshold = value
        label = ordered[i].label
        left_counts[label] = left_counts.get(label, 0) + 1
    return (best_gain, best_threshold)
def find_best_split(data):
    """Pick the (feature, threshold) with maximum information gain (TODO done).

    Scans every feature index of the points' value vectors using the fast
    threshold search. Returns (None, None) when the dataset has fewer than
    two points or when no split yields strictly positive gain.
    """
    if len(data) < 2:
        return None, None
    best_feature = None
    best_threshold = None
    best_gain = 0
    for feature in range(len(data[0].values)):
        gain, threshold = find_best_threshold_fast(data, feature)
        if gain > best_gain:
            best_gain = gain
            best_feature = feature
            best_threshold = threshold
    return (best_feature, best_threshold)
def make_leaf(data):
    """Build a leaf Tree whose prediction maps each label to its empirical frequency."""
    node = Tree()
    n = len(data)
    node.prediction = {label: float(count) / n
                       for label, count in count_labels(data).items()}
    return node
def c45(data, max_levels):
    """Greedily grow a C4.5-style decision tree of depth at most max_levels (TODO done).

    Returns a leaf when the depth budget is exhausted, when no split gains
    information, or when the chosen split fails to separate the data;
    otherwise splits on the best (feature, threshold) and recurses on both
    partitions with one less level.
    """
    if max_levels <= 0:
        return make_leaf(data)
    feature, threshold = find_best_split(data)
    if feature is None or threshold is None:
        # No informative split exists.
        return make_leaf(data)
    left, right = split_data(data, feature, threshold)
    if not left or not right:
        # Degenerate split (all points on one side): stop to avoid
        # infinite recursion.
        return make_leaf(data)
    tree = Tree()
    tree.leaf = False
    tree.feature = feature
    tree.threshold = threshold
    tree.left = c45(left, max_levels - 1)
    tree.right = c45(right, max_levels - 1)
    return tree
def submission(train, test):
    """Train a depth-4 tree on `train` and return a prediction dict per test point."""
    tree = c45(train, 4)
    return [predict(tree, point) for point in test]
# This might be useful for debugging.
def print_tree(tree):
    # Pre-order dump of the tree (note: Python 2 print-statement syntax,
    # consistent with the rest of this file).
    if tree.leaf:
        print "Leaf", tree.prediction
    else:
        print "Branch", tree.feature, tree.threshold
        print_tree(tree.left)
        print_tree(tree.right)
//////////////////////////////////// Tree class ends //////////////////////////////
////////////////////////// Point class begins //////////////////////////
class Point:
    """A labelled data point: `label` is the class, `values` the feature vector."""

    def __init__(self, label, values):
        self.label = label
        self.values = values

    def __str__(self):
        # repr() replaces the backtick syntax (`x`), which was removed in
        # Python 3; repr() is equivalent and valid in both Python 2 and 3.
        return "<" + self.label + ": " + repr(self.values) + ">"

    def __repr__(self):
        return "<" + self.label + ": " + repr(self.values) + ">"
def get_label(s, labels):
    """Return the first entry of `labels` contained in `s`; raise if none matches.

    `s` may be any container supporting `in` — a string or a list of
    filename components.
    """
    matches = [label for label in labels if label in s]
    if matches:
        return matches[0]
    raise Exception('Label not found', s)
def string_statistics(s):
    """Return a 32-bin character-frequency vector for string `s`.

    Bins 0-25: letters a-z (case-insensitive); 26: '.'; 27: ','; 28: '?';
    29: '!'; 30: digits; 31: everything else. Each bin holds the fraction
    of characters of `s` falling into it.
    Fix: an empty string now yields all zeros instead of raising
    ZeroDivisionError as the original did.
    """
    s = s.lower()
    values = [0] * 32
    total = len(s)
    if total == 0:
        return [0.0] * 32
    for c in s:
        n = ord(c)
        if 97 <= n <= 122:  # 'a'..'z'
            values[n - 97] += 1
        elif c == '.':
            values[26] += 1
        elif c == ',':
            values[27] += 1
        elif c == '?':
            values[28] += 1
        elif c == '!':
            values[29] += 1
        elif c in '0123456789':
            values[30] += 1
        else:
            values[31] += 1
    # Normalize counts to fractions of the total length.
    return [v / float(total) for v in values]
def get_data(filename, labels):
    """Load a dataset from a list file.

    `filename` names a file containing one document path per line; the
    label is inferred from the path's dot-separated components. Each
    document is read and converted to a character-statistics Point.
    Fix: both the list file and each document are now closed
    deterministically via `with` (the original leaked file handles).
    """
    data = []
    with open(filename) as listing:
        for line in listing:
            path = line.strip()
            label = get_label(path.split("."), labels)
            with open(path) as doc:
                contents = doc.read()
            data.append(Point(label, string_statistics(contents)))
    return data
def get_spam_train_data():
    """Load the spam/ham training split."""
    return get_data("spam/train.list", ("ham", "spam"))
def get_spam_valid_data():
    """Load the spam/ham validation split."""
    return get_data("spam/valid.list", ("ham", "spam"))
def get_college_data():
    """Toy (age, income) dataset labelled by college attendance."""
    raw = [
        ('College', [24, 40000]),
        ('No College', [53, 52000]),
        ('No College', [23, 25000]),
        ('College', [25, 77000]),
        ('College', [32, 48000]),
        ('College', [52, 110000]),
        ('College', [22, 38000]),
        ('No College', [43, 44000]),
        ('No College', [52, 27000]),
        ('College', [48, 65000]),
    ]
    return [Point(label, values) for label, values in raw]
Explanation / Answer
# "Entropy" here is Shannon entropy from information theory, not the
# thermodynamic quantity. For label counts c_i with total N it is
# entropy = -sum_i (c_i/N) * log2(c_i/N).
# It measures how mixed the labels in a node are: 0 for a pure node
# (one label only), and maximal when all labels are equally frequent.
# The decision tree picks splits that reduce this value the most.
In the code given in the question:
following additions are needed:
# In the section of count_labels
def count_labels(data):
counts = {}
# e.g. counts = {'spam': 10, 'ham': 4}
# Build the dictionary by looping over data and incrementing
# counts[point.label] for each point (counts.get(label, 0) + 1
# handles labels not seen yet).
# def split_data(data, feature, threshold): definition of splitting the data
def split_data(data, feature, threshold):
left = []
right = []
# Note: str.split does not apply here — `data` is a list of Point
# objects, not a string. To split the data by a feature and threshold,
# iterate over the points and compare the chosen feature's value
# against the threshold:
#
#   for point in data:
#       if point.values[feature] < threshold:
#           left.append(point)
#       else:
#           right.append(point)
#
# After the loop, left holds the points whose feature value is strictly
# below the threshold, and right holds those at or above it.
return (left, right)
# The same pattern — iterating over the data and accumulating a result —
# applies to the remaining TODO functions, e.g. counting labels into a
# dictionary and then converting that counts dictionary into entropy.
Related Questions
drjack9650@gmail.com
Navigate
Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.