How to find intersection of two sentences and include substrings
Question:
I am comparing sentences with jaccard similarity in Python.
However, I have a question for the intersection function:
import itertools
import pandas as pd
# Tokenize both sentences on single spaces.
item1 = "She went to a restaurant on Oxford Street".split(" ")
item2 = "She went to an Italian restaurant on Oxf. Street".split(" ")
# Exact-match intersection: only words spelled identically in both sentences.
set(item1).intersection(item2)
Result :
{'She', 'Street', 'on', 'restaurant', 'to', 'went'}
It only computes the intersection for strings that are completely identical. Is there a way to also include the word Oxf., since it is short for Oxford? I.e., if a word is a substring of a word in the other set, it should be included in the intersection.
Answers:
#!/usr/bin/env python
from pprint import pprint
def jaccard_similarity(str1, str2,
                       exclude_words=frozenset(["a", "an", "at", "in", "on", "is", "of",
                                                "with", "from", "to", "went"])):
    """Return the Jaccard similarity of two sentences, matching abbreviations.

    Each sentence has its periods removed, is split on whitespace and is
    reduced to a set of words without ``exclude_words``.  A word that is a
    proper prefix (at least three characters long) of another word found in
    either sentence is treated as an alias of that longer word and replaced
    by it, so that ``"Oxf."`` and ``"Oxford"`` count as the same word.
    Comparison is case-sensitive.

    The alias table, the un-aliased word sets and their intersection are
    printed to standard output.

    Args:
        str1: First sentence.
        str2: Second sentence.
        exclude_words: Iterable of words to ignore in both sentences.

    Returns:
        ``len(intersection) / len(union)`` of the two un-aliased word sets as
        a float, or ``0.0`` when both sets are empty.
    """
    # Accept any iterable (list, tuple, set) for the exclusion list.
    excluded = frozenset(exclude_words)

    # Remove all periods "." to reduce later computation (num alias comparisons).
    # split() without a separator collapses runs of whitespace, so repeated
    # spaces or an empty sentence never produce a bogus "" word.
    x, y = [set(v.replace(".", "").split()) - excluded for v in (str1, str2)]

    min_alias_length = 3
    word_aliases = {}
    for word in x | y:
        # Every proper prefix of at least min_alias_length characters.
        aliases = {word[:i] for i in range(min_alias_length, len(word))}
        if aliases:
            word_aliases[word] = aliases
    print("Word aliases:")
    pprint(word_aliases)

    # Process the longest words first (ties broken alphabetically) so that an
    # abbreviation matching several words resolves the same way on every run
    # instead of depending on set/dict iteration order.
    for word in sorted(word_aliases, key=lambda w: (-len(w), w)):
        aliases = word_aliases[word]
        for words in (x, y):
            aliases_in_words = aliases & words
            if aliases_in_words:
                # Remove all aliases and replace with single "original" word.
                words -= aliases_in_words
                words.add(word)
    print("Unaliased word sets:")
    print(x)
    print(y)

    intersection = x & y
    print("Intersection:")
    print(intersection)

    union_cardinality = len(x | y)
    if not union_cardinality:
        # Both sentences are empty after filtering; avoid dividing by zero.
        return 0.0
    return len(intersection) / float(union_cardinality)
import itertools
# Demo: score the two example sentences from the question.
item1, item2 = (
    "She went to a restaurant on Oxford Street",
    "She went to an Italian restaurant on Oxf. Street",
)
result = jaccard_similarity(item1, item2)
print(result)
Output:
Word aliases:
{'Italian': {'Italia', 'Ital', 'Itali', 'Ita'},
'Oxford': {'Oxfo', 'Oxf', 'Oxfor'},
'Street': {'Stree', 'Stre', 'Str'},
'restaurant': {'res',
'rest',
'resta',
'restau',
'restaur',
'restaura',
'restauran'}}
Unaliased word sets:
{'Oxford', 'Street', 'restaurant', 'She'}
{'She', 'restaurant', 'Oxford', 'Italian', 'Street'}
Intersection:
{'Oxford', 'She', 'restaurant', 'Street'}
0.8
Here is a solution if you want your intersection to gather all words where one begins with the other (which will create problems in your case with words that are not really abbreviations). It might be useful to someone.
# Words of item1 that are a prefix of, or are prefixed by, some word of item2.
# Trailing periods are ignored for the comparison so that an abbreviation such
# as "Oxf." still matches "Oxford"; tokens left empty by the strip are skipped,
# since every string starts with "".
stems2 = {w.rstrip(".") for w in item2} - {""}
[x for x in set(item1)
 if x.rstrip(".")
 and any(x.rstrip(".").startswith(u) or u.startswith(x.rstrip(".")) for u in stems2)]
I am comparing sentences with jaccard similarity in Python.
However, I have a question for the intersection function:
import itertools
import pandas as pd
# Tokenize both sentences on single spaces.
item1 = "She went to a restaurant on Oxford Street".split(" ")
item2 = "She went to an Italian restaurant on Oxf. Street".split(" ")
# Exact-match intersection: only words spelled identically in both sentences.
set(item1).intersection(item2)
Result :
{'She', 'Street', 'on', 'restaurant', 'to', 'went'}
It only computes the intersection for strings that are completely identical. Is there a way to also include the word Oxf., since it is short for Oxford? I.e., if a word is a substring of a word in the other set, it should be included in the intersection.
#!/usr/bin/env python
from pprint import pprint
def jaccard_similarity(str1, str2,
                       exclude_words=frozenset(["a", "an", "at", "in", "on", "is", "of",
                                                "with", "from", "to", "went"])):
    """Return the Jaccard similarity of two sentences, matching abbreviations.

    Each sentence has its periods removed, is split on whitespace and is
    reduced to a set of words without ``exclude_words``.  A word that is a
    proper prefix (at least three characters long) of another word found in
    either sentence is treated as an alias of that longer word and replaced
    by it, so that ``"Oxf."`` and ``"Oxford"`` count as the same word.
    Comparison is case-sensitive.

    The alias table, the un-aliased word sets and their intersection are
    printed to standard output.

    Args:
        str1: First sentence.
        str2: Second sentence.
        exclude_words: Iterable of words to ignore in both sentences.

    Returns:
        ``len(intersection) / len(union)`` of the two un-aliased word sets as
        a float, or ``0.0`` when both sets are empty.
    """
    # Accept any iterable (list, tuple, set) for the exclusion list.
    excluded = frozenset(exclude_words)

    # Remove all periods "." to reduce later computation (num alias comparisons).
    # split() without a separator collapses runs of whitespace, so repeated
    # spaces or an empty sentence never produce a bogus "" word.
    x, y = [set(v.replace(".", "").split()) - excluded for v in (str1, str2)]

    min_alias_length = 3
    word_aliases = {}
    for word in x | y:
        # Every proper prefix of at least min_alias_length characters.
        aliases = {word[:i] for i in range(min_alias_length, len(word))}
        if aliases:
            word_aliases[word] = aliases
    print("Word aliases:")
    pprint(word_aliases)

    # Process the longest words first (ties broken alphabetically) so that an
    # abbreviation matching several words resolves the same way on every run
    # instead of depending on set/dict iteration order.
    for word in sorted(word_aliases, key=lambda w: (-len(w), w)):
        aliases = word_aliases[word]
        for words in (x, y):
            aliases_in_words = aliases & words
            if aliases_in_words:
                # Remove all aliases and replace with single "original" word.
                words -= aliases_in_words
                words.add(word)
    print("Unaliased word sets:")
    print(x)
    print(y)

    intersection = x & y
    print("Intersection:")
    print(intersection)

    union_cardinality = len(x | y)
    if not union_cardinality:
        # Both sentences are empty after filtering; avoid dividing by zero.
        return 0.0
    return len(intersection) / float(union_cardinality)
import itertools
# Demo: score the two example sentences from the question.
item1, item2 = (
    "She went to a restaurant on Oxford Street",
    "She went to an Italian restaurant on Oxf. Street",
)
result = jaccard_similarity(item1, item2)
print(result)
Output:
Word aliases:
{'Italian': {'Italia', 'Ital', 'Itali', 'Ita'},
'Oxford': {'Oxfo', 'Oxf', 'Oxfor'},
'Street': {'Stree', 'Stre', 'Str'},
'restaurant': {'res',
'rest',
'resta',
'restau',
'restaur',
'restaura',
'restauran'}}
Unaliased word sets:
{'Oxford', 'Street', 'restaurant', 'She'}
{'She', 'restaurant', 'Oxford', 'Italian', 'Street'}
Intersection:
{'Oxford', 'She', 'restaurant', 'Street'}
0.8
Here is a solution if you want your intersection to gather all words where one begins with the other (which will create problems in your case with words that are not really abbreviations). It might be useful to someone.
# Words of item1 that are a prefix of, or are prefixed by, some word of item2.
# Trailing periods are ignored for the comparison so that an abbreviation such
# as "Oxf." still matches "Oxford"; tokens left empty by the strip are skipped,
# since every string starts with "".
stems2 = {w.rstrip(".") for w in item2} - {""}
[x for x in set(item1)
 if x.rstrip(".")
 and any(x.rstrip(".").startswith(u) or u.startswith(x.rstrip(".")) for u in stems2)]