"""
List of all dependencies used.
"""
import argparse
import io
import json
import os
import re
import urllib.parse
import urllib.request
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
from bs4 import BeautifulSoup
from geotext import GeoText
from google.cloud import language

plt.rcdefaults()
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
"""
Function: returns a list of URLs.
Module Name: get_santaized_results_from_duckduckgo
Return: list of all URLs
Description: The module takes a query word as input and then constructs the URL
(http://duckduckgo.com/html/?q=<query_word>). The URL is then scraped using the
BeautifulSoup library and all the href links are collected into a list.
The list is then trimmed, unquoted, and de-duplicated while preserving the
original ordering. Overall the process is that simple.
"""
def get_santaized_results_from_duckduckgo(query_word):
    url_builder = ('http://duckduckgo.com/html/?q=%s' % query_word)
    site = urllib.request.urlopen(url_builder)
    data = site.read()
    soup = BeautifulSoup(data, "html.parser")
    url_list = []
    for link in soup.find_all('a'):
        curr_string = str(link.get('href'))
        start_pos = curr_string.find('http')
        if start_pos != -1:
            curr_string = curr_string[start_pos:]
            url = urllib.parse.unquote(urllib.parse.unquote(curr_string))
            url_list.append(url)
    # Remove duplicates while keeping the original ordering.
    url_set = sorted(set(url_list), key=url_list.index)
    # url_set = set(url_list)
    print(url_set)
    return url_set
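# A minimal usage sketch (illustrative only, left commented out so it does not
# trigger an extra network request; the query string is just an example):
# example_urls = get_santaized_results_from_duckduckgo("Nivin")
# print(example_urls[:5])   # look at the first few result URLs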
"""
Function: get all words.
Module Name: get_token_from_site
Return: all the words.
Description: The function fetches the URL and decodes the response as UTF-8.
After that the parser is used to extract the HTML text alone and nltk is used to
generate the word tokens.
"""
def get_token_from_site(url):
    site = urllib.request.urlopen(url)
    html = site.read().decode('utf-8')  # download the html content
    raw = BeautifulSoup(html, "html.parser").get_text()
    # tokens = nltk.word_tokenize(raw)
    tokens = nltk.wordpunct_tokenize(raw)
    text = nltk.Text(tokens)  # tokenize the text
    # Normalize the words and separate out the unique vocabulary.
    words = [w.lower() for w in text]
    vocab = sorted(set(words))
    print(vocab)
    return vocab
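# Illustrative usage sketch (left commented out; the URL is the same demo page
# used further below and is only an example):
# vocab = get_token_from_site("https://en.wikipedia.org/wiki/Nivin_Pauly")
# print(len(vocab))   # number of distinct lower-cased tokens on the page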
"""
Function: return all noun words.
Module Name: get_noun_from_site
Return: all noun words.
Description: The function fetches the URL and decodes the response as UTF-8.
After that the parser is used to extract the HTML text alone, nltk tokenizes the
words, and the part-of-speech tagger keeps only the nouns.
"""
def get_noun_from_site(url):
    site = urllib.request.urlopen(url)
    html = site.read().decode('utf-8')  # download the html content
    raw = BeautifulSoup(html, "html.parser").get_text()
    # tokens = nltk.word_tokenize(raw)
    tokens = nltk.wordpunct_tokenize(raw)
    text = nltk.Text(tokens)  # tokenize the text
    nouns = []
    for word, pos in nltk.pos_tag(text):
        if pos in ['NN', 'NNP']:  # feel free to add any other noun tags
            nouns.append(word)
    return nouns


def get_noun_from_site_counter(url):
    nouns = get_noun_from_site(url)
    counts = Counter(nouns)
    return counts


data = get_noun_from_site_counter("https://en.wikipedia.org/wiki/Nivin_Pauly")
print(data)
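# Illustrative sketch of the tagging step used above (the sentence is invented
# and the tags shown are what nltk.pos_tag typically returns, not guaranteed):
# nltk.pos_tag(nltk.wordpunct_tokenize("Nivin Pauly is an actor"))
# roughly -> [('Nivin', 'NNP'), ('Pauly', 'NNP'), ('is', 'VBZ'), ('an', 'DT'), ('actor', 'NN')]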
"""
"""defget_places_list_from_site(url):site=urllib.request.urlopen(url)html=site.read().decode('utf-8')### download the html content
#raw = nltk.clean_html(html)
raw=BeautifulSoup(html,"html.parser").get_text()#tokens = nltk.word_tokenize(raw)
tokens=nltk.wordpunct_tokenize(raw)wordlist=nltk.Text(tokens)#print(wordlist)
placesr=[]cities=[]fortextinwordlist:places=GeoText(text)iflen(places.countries)>0:#print(places.countries)
placesr+=places.countriesiflen(places.cities)>0:#print(places.cities)
cities+=places.cities#if len(places.)
return(placesr,cities)
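# Small illustrative sketch (not part of the original pipeline; the sentence is
# invented purely for demonstration). GeoText can be run on any string:
# demo = GeoText("He travelled from Kochi to London last year.")
# print(demo.cities)      # e.g. ['Kochi', 'London'], if both are in GeoText's city list
# print(demo.countries)   # any country names mentioned in the text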
def classify(text, verbose=True):
    """Classify the input text into categories."""
    language_client = language.LanguageServiceClient()
    document = language.types.Document(
        content=text,
        type=language.enums.Document.Type.PLAIN_TEXT)
    response = language_client.classify_text(document)
    categories = response.categories
    result = {}
    for category in categories:
        # Turn the categories into a dictionary of the form:
        # {category.name: category.confidence}, so that they can
        # be treated as a sparse vector.
        result[category.name] = category.confidence
    if verbose:
        print(text)
        for category in categories:
            print(u'=' * 20)
            print(u'{:<16}: {}'.format('category', category.name))
            print(u'{:<16}: {}'.format('confidence', category.confidence))
    return result
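# Usage note (an assumption, not stated in the original code): the Google Cloud
# Natural Language client generally needs credentials, e.g. the
# GOOGLE_APPLICATION_CREDENTIALS environment variable pointing at a service
# account key, and classify_text tends to require a reasonably long input
# (roughly 20+ tokens). A commented sketch:
# categories = classify("A long paragraph of text about the profile goes here ...")
# print(categories)   # mapping of category name -> confidence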
query_word = "Nivin"
lis_data = get_santaized_results_from_duckduckgo(query_word)
sz = len(lis_data)
ma = min(10, len(lis_data))
j = i = 0
accum = []
placesacc = []
mobilesacc = []
# Walk through at most the first ten reachable URLs, accumulating nouns, places
# and contact details. get_normalized_sentence() and
# parse_mobile_n_email_address() are expected to be defined elsewhere in this
# project.
while i < ma and j < sz:
    try:
        each = lis_data[j]
        print("url", each)
        anes = classify(get_normalized_sentence(each))
        data = get_noun_from_site(each)
        if len(data) > 0:
            accum += data
        places = get_places_list_from_site(each)
        if len(places) > 0:
            placesacc += places[0]
            placesacc += places[1]
        box = parse_mobile_n_email_address(each)
        if len(box) > 0:
            mobilesacc += box
        i += 1
    except:
        print("Authentication Required...")
    j += 1
# print(accum, placesacc, (mobilesacc))

placesacc = set(placesacc)
print("#### Places:", placesacc)
print("### Mobile & Email Address", mobilesacc)
res = Counter(accum)
normalized_nouns = []
frequeny_vector = []
all_nouns = []
sume = 0
# Keep only nouns longer than three characters and build the frequency vector.
for k in res:
    if len(k) > 3:
        normalized_nouns.append((k, res[k]))
        frequeny_vector.append(res[k])
        all_nouns.append(k)
        sume += res[k]
# print(normalized_nouns)
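# Illustrative sketch of the filtering step above (toy data, not real output):
# with res = Counter({'actor': 5, 'film': 3, 'in': 7}), only keys longer than
# three characters survive, giving
#   normalized_nouns = [('actor', 5), ('film', 3)]
#   frequeny_vector  = [5, 3]
#   sume             = 8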
"""
Scoring metric:
VectorA dot VectorB dot VectorC = [Frequency Vector] dot [Confidence Vector] dot [Damage or Exploit Vector]
For simplicity, the confidence vector and the damage vector are user fed.
A small worked numeric example is sketched after the code below.
"""
sz = len(normalized_nouns)
confidence_vector = np.zeros(sz)
exploit_vector = np.zeros(sz)
user_defined_scored = np.zeros(sz)
score = np.zeros(sz)

# Plot the frequency of each retained noun.
y_pos = np.arange(len(all_nouns))
performance = [10, 8, 6, 4, 2, 1]
plt.bar(y_pos, frequeny_vector, align='center', alpha=0.5)
plt.xticks(y_pos, all_nouns)
plt.ylabel('Frequency Count')
plt.title('Frequency of Nouns')
plt.show()

# Fill the vectors and combine them into a single score.
for i in range(sz):
    confidence_vector[i] = 0.5
    frequeny_vector[i] /= sume
    exploit_vector[i] = 0.2
    user_defined_scored[i] = exploit_vector[i] * confidence_vector[i]

score = (np.dot(confidence_vector, user_defined_scored) / sz) * 4.0
print("Profile Tester Score is ", score)