In natural language processing, it is important to identify \"phrases\" from tex

ID: 3739592 • Letter: I

Question

In natural language processing, it is important to identify "phrases" from text. By considering phrases as word sequences of fixed order that are frequent in the corpus, one can apply the sequential pattern mining algorithm GSP, Prefix-Span or SPADE to solve the problem.

In this assignment, you will be given raw text sentences and you need to implement GSP or other equivalent sequential pattern mining algorithm to find frequent phrases from the text.

You can assume that the input is in English only. As the first step, please remove less important words (stop words) from the sentences. You can use the english stopwords list from below.

Then, convert all words to lower case to remove ambiguity. Now, start building the sequence database. Specifically, process conjunction joiner ("and") by grouping joined words into itemsets. That is, for example, make "A, B and C" into (A, B, C) in sequence transaction.

Input Format

The input dataset is raw text sentences.

The first line of the input corresponds to the minimum support.

Each following line of the input corresponds to one sentence. Words in each transaction are seperated by a space.

Please refer to the sample input below. In sample input 0, the minimum support is 3, and the dataset contains 3 sentences and 5 words (a, b, c, d, e, f and g).

Constraints

Output Format

The output are the longest length frequent phrases you mined out from the input dataset. This means that you have to report the phrase with the longest length which satisfies the minimum support criterion.

Each line of the output should be in the format:

The frequent phrases should be ordered according to their support from largest to smallest. Ties should be resolved by ordering the frequent phrases according to the alphabetical order.

Please refer to the sample output below. In sample output 1, the four phrases are the sequential frequent phrases calcualted after converting b and c into (bc).

Sample Input 0

Sample Output 0

Sample Input 1

Sample Output 1

Explanation / Answer

import sys
import itertools
my_list = []

for line in sys.stdin:

data=line.lower().strip().replace(' a ',' ').replace(' an ',' ').replace(' are ',' ').replace(' as ',' ').replace(' at ',' ').replace(' by ',' ').replace(' be ',' ').replace(' for ',' ').replace(' from ',' ').replace(' has ',' ').replace(' he ',' ').replace(' in ',' ').replace(' is ',' ').replace(' it ',' ').replace(' its ',' ').replace(' of ',' ').replace(' on ',' ').replace(' that ',' ').replace(' the ',' ').replace('the ','').replace(' to ',' ').replace(' was ', '').replace(' were ',' ').replace(' will ',' ').replace(' with ',' ').replace('.','').replace(', and ','$').replace(', ','$').replace(' and ','$').split(' ')
my_list.append(data)

min_support = int(my_list[0][0])
del my_list[0]

my_list1=[]
for i in my_list:
    temp_list=[]
    for j in i:
        temp_list.append(j.split('$'))
    my_list1.append(temp_list)

my_list=my_list1[:]
del my_list1
del temp_list

database={}
for sid in range(len(my_list)):
for eid in range(len(my_list[sid])):

        if sid in database:
            database[sid][eid]=sorted([item for item in my_list[sid][eid]])
        else:
            database[sid]={eid:sorted([item for item in my_list[sid][eid]])}

#single frequent items
items=[]
for i in range(len(my_list)):
    for j in range(len(my_list[i])):
        for k in range(len(my_list[i][j])):
            items.append(my_list[i][j][k])

items=sorted(list(set(items)))

item_count={}

for key in items:
    for i in range(len(my_list)):
        for j in range(len(my_list[i])):

            if key in my_list[i][j]:
                if key in item_count:
                    item_count[key]+=1
                    break
                else:
                    item_count[key]=1
                    break

shortlisted_items=[]
useless_items=[]

for key in item_count:
    if item_count[key] >= min_support:
        shortlisted_items.append(key)
    else:
        useless_items.append(key)

shortlisted_items.sort()
for item in useless_items:
    del item_count[item]
    items.remove(item)
    for sid in database:
        for eid in database[sid].keys():
            if item in database[sid][eid]:
                database[sid][eid].remove(item)

def frequent_single_items(min_support):
    singles_list=[]
    for perms in singles:
        for i in range(len(my_list)):
            for j in range(len(my_list[i])):
                check=True
                for things in perms:
                    if things not in my_list[i][j]:
                        check = False
                        break
                if check==True:
                    singles_list.append(perms)
                    break
            #if check==True:
                #break

    singles_count=[]
    for i in singles:
        if i in singles_list:
            singles_count.append((i,singles_list.count(i)))

    shortlisted_single_items=[]
    shortlisted_single_items_count=[]
    for i in singles_count:
        if i[1]>=min_support:
            shortlisted_single_items.append(i[0])
            shortlisted_single_items_count.append(i[1])
    return shortlisted_single_items,shortlisted_single_items_count

def frequent_multiple_items(min_support):
    multiples_list=[]
    for sequence in my_list:
        final_temp_list=[]
        for i in range(len(multiples)):

            temp_list=[]
            temp_list1=[]
            event=0
            for j in range(len(multiples[i])):

check=True
variable_so_that_item_without_while_is_not_added=0

                while event<len(sequence):
                    if set(multiples[i][j])<=set(sequence[event]):
                        check=True
                        variable_so_that_item_without_while_is_not_added=1
                        break
                    else:
                        check=False
                        event+=1
                event+=1
                if check==True and variable_so_that_item_without_while_is_not_added==1:
                    temp_list.append(event)
                    temp_list1.append(multiples[i][j])
                else:
                    break
            if temp_list1==multiples[i]:
                final_temp_list.append(temp_list1)
        multiples_list.extend(final_temp_list)

    multiples_count=[]
    for i in multiples:
        if i in multiples_list:
            multiples_count.append((i,multiples_list.count(i)))

    shortlisted_multiple_items=[]
    shortlisted_multiple_items_count=[]
    for i in multiples_count:
        if i[1]>=min_support:
            shortlisted_multiple_items.append(i[0])
            shortlisted_multiple_items_count.append(i[1])
    return shortlisted_multiple_items,shortlisted_multiple_items_count

singles=list(itertools.combinations(items,2))
multiples=list(itertools.product([[i] for i in items],repeat=2))

for i in range(len(singles)):
singles[i]=list(singles[i])
for i in range(len(multiples)):
multiples[i]=list(multiples[i])

a,a_count= frequent_single_items(min_support)
b,b_count=frequent_multiple_items(min_support)
#print(a,a_count)
#print(b, b_count)
count=3
while a or b:
    if len(a)==0:
        final_frequent_items=b[:]
        final_frequent_count=b_count[:]
    elif len(b)==0:
        final_frequent_items=a[:]
        final_frequent_count=a_count[:]
    singles=[]
    multiples=[]
    if len(a)==0 and len(b)!=0:

        for i in b:
            for j in b:
                if i[1:]==j[:-1]:

                    multiples.append(list(i[0:1]+i[1:]+j[-1:]))
    elif len(a)!=0 and len(b)!=0:

        for i in b:
            for j in b:
                if i[1:]==j[:-1]:
                    multiples.append(list(i[0:1]+i[1:]+j[-1:]))
        for i in b:
            for j in a:
                if list(i[0:1]+[j]) not in multiples:
                    multiples.append(list(i[0:1]+[j]))
                if list([j]+i[1:]) not in multiples:
                    multiples.append(list([j]+i[1:]))
        singles=list(itertools.combinations(items,count))
        for i in range(len(singles)):
            singles[i]=list(singles[i])
        count+=1
        a,a_count= frequent_single_items(min_support)
    elif len(a)!=0 and len(b)==0:

        singles=list(itertools.combinations(items,count))
        for i in range(len(singles)):
            singles[i]=list(singles[i])
        count+=1
        a,a_count= frequent_single_items(min_support)
    b,b_count=frequent_multiple_items(min_support)

#print(b,b_count)

for i, j in zip(final_frequent_count,final_frequent_items):
    k=''
    for m in j:

        k+=str(m[0]) + ' '
    k=k.strip()
print(i, '['+k+']')

Navigate

In narrative format discuss the key facts and critical issues of the life, crime

In natural uranium, 0.720% of the atoms are the isotope 235-Uranium, 0.0055% are

Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.

In natural language processing, it is important to identify \"phrases\" from tex

Question

Explanation / Answer

Related Questions

Navigate