I'm new to the Python language and having trouble writing the following method for
ID: 3663112 • Letter: I
Question
I'm new to the Python language and am having trouble writing the following method for a decision tree. Any help would be appreciated.
def counts_to_entropy(counts):
entropy = 0.0
# TODO: should convert a dictionary of counts into entropy
return entropy
Here are the given two classes for this assignment:
//////////////////////// Tree class starts ////////////////////////////////
from math import log
class Tree:
    # A decision-tree node. A leaf node (leaf == True) carries a
    # `prediction` dict mapping label -> probability. An internal node
    # (leaf == False) carries the split `feature` index, the numeric
    # `threshold`, and `left`/`right` child Trees (left: value < threshold,
    # right: value >= threshold — see predict()).
    leaf = True
    prediction = None
    feature = None
    threshold = None
    left = None
    right = None
def predict(tree, point):
    """Walk the tree from the root and return the reached leaf's prediction dict."""
    node = tree
    while not node.leaf:
        # Descend left when the point's value on the split feature is
        # strictly below the node's threshold, right otherwise.
        if point.values[node.feature] < node.threshold:
            node = node.left
        else:
            node = node.right
    return node.prediction
def most_likely_class(prediction):
    """Return the label with the highest probability in a prediction dict.

    Ties resolve to the first maximal key in the dict's iteration order,
    matching the original index-of-max implementation.
    """
    return max(prediction, key=prediction.get)
def accuracy(data, predictions):
    """Fraction of points whose most-likely predicted label matches the truth.

    `predictions[i]` is a dict label -> probability for `data[i]`.
    """
    correct = 0
    total = 0
    for i in range(len(data)):
        total += 1
        prediction = predictions[i]
        # Inlined most-likely-class lookup: the key with the largest probability.
        if max(prediction, key=prediction.get) == data[i].label:
            correct += 1
    return float(correct) / total
def split_data(data, feature, threshold):
    """Partition `data` on one feature (implements the TODO stub).

    left collects points whose values[feature] is strictly less than
    `threshold`; right collects points with values[feature] >= threshold.
    Relative order of points is preserved in both partitions.
    Returns the tuple (left, right).
    """
    left = []
    right = []
    for point in data:
        if point.values[feature] < threshold:
            left.append(point)
        else:
            right.append(point)
    return (left, right)
def count_labels(data):
    """Count label occurrences in `data` (implements the TODO stub).

    Returns a dict mapping label -> count, e.g. {'spam': 10, 'ham': 4}.
    An empty dataset yields an empty dict.
    """
    counts = {}
    for point in data:
        # .get with a default handles labels not seen yet.
        counts[point.label] = counts.get(point.label, 0) + 1
    return counts
def counts_to_entropy(counts):
    """Convert a dict of label counts into Shannon entropy (implements the TODO).

    counts: dict mapping label -> occurrence count, e.g. {'spam': 10, 'ham': 4}.
    Uses log base 2 (bits); note information-gain comparisons are invariant
    to the base. Returns 0.0 for an empty dict or a single-label (pure) node.
    """
    entropy = 0.0
    total = sum(counts.values())
    if total == 0:
        return entropy
    for count in counts.values():
        if count > 0:  # p*log(p) -> 0 as p -> 0; skip to avoid log(0)
            p = float(count) / total
            entropy -= p * log(p, 2)
    return entropy
def get_entropy(data):
    """Entropy of the label distribution of `data` (count, then convert)."""
    return counts_to_entropy(count_labels(data))
# This is a correct but inefficient way to find the best threshold to maximize
# information gain.
def find_best_threshold(data, feature):
    """Try every point's feature value as a threshold.

    O(n^2): each candidate re-splits and re-scores the entire dataset.
    Returns (best_gain, best_threshold); best_threshold stays None when
    no candidate achieves strictly positive gain.
    """
    base_entropy = get_entropy(data)
    best_gain = 0
    best_threshold = None
    for candidate in data:
        t = candidate.values[feature]
        left, right = split_data(data, feature, t)
        # Size-weighted average entropy of the two partitions.
        weighted = (get_entropy(left) * len(left) + get_entropy(right) * len(right)) / len(data)
        gain = base_entropy - weighted
        if gain > best_gain:
            best_gain = gain
            best_threshold = t
    return (best_gain, best_threshold)
def find_best_threshold_fast(data, feature):
    """Find the threshold on `feature` maximizing information gain (TODO done).

    O(n log n): sorts the points once, then sweeps candidate thresholds in
    order while maintaining running label counts for the left partition,
    instead of re-splitting and re-counting for every candidate as the
    slow version does. Candidate thresholds are the distinct feature
    values, splitting as left: value < t, right: value >= t — the same
    candidates the slow version tries.
    Returns (best_gain, best_threshold); best_threshold is None when no
    split yields strictly positive gain.
    """
    entropy = get_entropy(data)
    best_gain = 0
    best_threshold = None
    n = len(data)
    if n == 0:
        return (best_gain, best_threshold)
    total_counts = count_labels(data)
    ordered = sorted(data, key=lambda p: p.values[feature])
    left_counts = {}
    for i in range(n):
        value = ordered[i].values[feature]
        # Evaluate threshold=value before absorbing ordered[i]; then the
        # left partition is exactly ordered[:i] (all strictly < value).
        # Skip duplicates (same split) and i == 0 (empty left, gain 0).
        if i > 0 and value != ordered[i - 1].values[feature]:
            right_counts = {}
            for label, count in total_counts.items():
                rest = count - left_counts.get(label, 0)
                if rest > 0:
                    right_counts[label] = rest
            weighted = (counts_to_entropy(left_counts) * i +
                        counts_to_entropy(right_counts) * (n - i)) / n
            gain = entropy - weighted
            if gain > best_gain:
                best_gain = gain
                best_threshold = value
        label = ordered[i].label
        left_counts[label] = left_counts.get(label, 0) + 1
    return (best_gain, best_threshold)
def find_best_split(data):
    """Pick the (feature, threshold) with maximum information gain (TODO done).

    Scans every feature index of the points' value vectors using the fast
    threshold search. Returns (None, None) when the dataset has fewer than
    two points or when no split yields strictly positive gain.
    """
    if len(data) < 2:
        return None, None
    best_feature = None
    best_threshold = None
    best_gain = 0
    for feature in range(len(data[0].values)):
        gain, threshold = find_best_threshold_fast(data, feature)
        if gain > best_gain:
            best_gain = gain
            best_feature = feature
            best_threshold = threshold
    return (best_feature, best_threshold)
def make_leaf(data):
    """Build a leaf Tree whose prediction maps each label to its empirical frequency."""
    node = Tree()
    n = len(data)
    node.prediction = {label: float(count) / n
                       for label, count in count_labels(data).items()}
    return node
def c45(data, max_levels):
    """Greedily grow a C4.5-style decision tree of depth at most max_levels (TODO done).

    Returns a leaf when the depth budget is exhausted, when no split gains
    information, or when the chosen split fails to separate the data;
    otherwise splits on the best (feature, threshold) and recurses on both
    partitions with one less level.
    """
    if max_levels <= 0:
        return make_leaf(data)
    feature, threshold = find_best_split(data)
    if feature is None or threshold is None:
        # No informative split exists.
        return make_leaf(data)
    left, right = split_data(data, feature, threshold)
    if not left or not right:
        # Degenerate split (all points on one side): stop to avoid
        # infinite recursion.
        return make_leaf(data)
    tree = Tree()
    tree.leaf = False
    tree.feature = feature
    tree.threshold = threshold
    tree.left = c45(left, max_levels - 1)
    tree.right = c45(right, max_levels - 1)
    return tree
def submission(train, test):
    """Train a depth-4 tree on `train` and return a prediction dict per test point."""
    tree = c45(train, 4)
    return [predict(tree, point) for point in test]
# This might be useful for debugging.
def print_tree(tree):
    # Pre-order dump of the tree (note: Python 2 print-statement syntax,
    # consistent with the rest of this file).
    if tree.leaf:
        print "Leaf", tree.prediction
    else:
        print "Branch", tree.feature, tree.threshold
        print_tree(tree.left)
        print_tree(tree.right)
//////////////////////////////////// Tree class ends //////////////////////////////
////////////////////////// Point class begins //////////////////////////
class Point:
    """A labelled data point: `label` is the class, `values` the feature vector."""

    def __init__(self, label, values):
        self.label = label
        self.values = values

    def __str__(self):
        # repr() replaces the backtick syntax (`x`), which was removed in
        # Python 3; repr() is equivalent and valid in both Python 2 and 3.
        return "<" + self.label + ": " + repr(self.values) + ">"

    def __repr__(self):
        return "<" + self.label + ": " + repr(self.values) + ">"
def get_label(s, labels):
    """Return the first entry of `labels` contained in `s`; raise if none matches.

    `s` may be any container supporting `in` — a string or a list of
    filename components.
    """
    matches = [label for label in labels if label in s]
    if matches:
        return matches[0]
    raise Exception('Label not found', s)
def string_statistics(s):
    """Return a 32-bin character-frequency vector for string `s`.

    Bins 0-25: letters a-z (case-insensitive); 26: '.'; 27: ','; 28: '?';
    29: '!'; 30: digits; 31: everything else. Each bin holds the fraction
    of characters of `s` falling into it.
    Fix: an empty string now yields all zeros instead of raising
    ZeroDivisionError as the original did.
    """
    s = s.lower()
    values = [0] * 32
    total = len(s)
    if total == 0:
        return [0.0] * 32
    for c in s:
        n = ord(c)
        if 97 <= n <= 122:  # 'a'..'z'
            values[n - 97] += 1
        elif c == '.':
            values[26] += 1
        elif c == ',':
            values[27] += 1
        elif c == '?':
            values[28] += 1
        elif c == '!':
            values[29] += 1
        elif c in '0123456789':
            values[30] += 1
        else:
            values[31] += 1
    # Normalize counts to fractions of the total length.
    return [v / float(total) for v in values]
def get_data(filename, labels):
    """Load a dataset from a list file.

    `filename` names a file containing one document path per line; the
    label is inferred from the path's dot-separated components. Each
    document is read and converted to a character-statistics Point.
    Fix: both the list file and each document are now closed
    deterministically via `with` (the original leaked file handles).
    """
    data = []
    with open(filename) as listing:
        for line in listing:
            path = line.strip()
            label = get_label(path.split("."), labels)
            with open(path) as doc:
                contents = doc.read()
            data.append(Point(label, string_statistics(contents)))
    return data
def get_spam_train_data():
    """Load the spam/ham training split."""
    return get_data("spam/train.list", ("ham", "spam"))
def get_spam_valid_data():
    """Load the spam/ham validation split."""
    return get_data("spam/valid.list", ("ham", "spam"))
def get_college_data():
    """Toy (age, income) dataset labelled by college attendance."""
    raw = [
        ('College', [24, 40000]),
        ('No College', [53, 52000]),
        ('No College', [23, 25000]),
        ('College', [25, 77000]),
        ('College', [32, 48000]),
        ('College', [52, 110000]),
        ('College', [22, 38000]),
        ('No College', [43, 44000]),
        ('No College', [52, 27000]),
        ('College', [48, 65000]),
    ]
    return [Point(label, values) for label, values in raw]
Explanation / Answer
# "Entropy" here is Shannon entropy from information theory, not the
# thermodynamic quantity. For label counts c_i with total N it is
# entropy = -sum_i (c_i/N) * log2(c_i/N).
# It measures how mixed the labels in a node are: 0 for a pure node
# (one label only), and maximal when all labels are equally frequent.
# The decision tree picks splits that reduce this value the most.
In the code given in the question:
following additions are needed:
# In the section of count_labels
def count_labels(data):
counts = {}
# e.g. counts = {'spam': 10, 'ham': 4}
# Build the dictionary by looping over data and incrementing
# counts[point.label] for each point (counts.get(label, 0) + 1
# handles labels not seen yet).
# def split_data(data, feature, threshold): definition of splitting the data
def split_data(data, feature, threshold):
left = []
right = []
# Note: str.split does not apply here — `data` is a list of Point
# objects, not a string. To split the data by a feature and threshold,
# iterate over the points and compare the chosen feature's value
# against the threshold:
#
#   for point in data:
#       if point.values[feature] < threshold:
#           left.append(point)
#       else:
#           right.append(point)
#
# After the loop, left holds the points whose feature value is strictly
# below the threshold, and right holds those at or above it.
return (left, right)
# The same pattern — iterating over the data and accumulating a result —
# applies to the remaining TODO functions, e.g. counting labels into a
# dictionary and then converting that counts dictionary into entropy.
Related Questions
drjack9650@gmail.com
Navigate
Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.