Cybersecurity Project

Online Profile Grader

Posted by Nivin Anton Alexis Lawrence on November 28, 2017

Online Profile Tester

Cyber Security Project by Nivin

"""
list of all dependenices used.
"""
import re, urllib.request
import urllib,re
import urllib
import json
import nltk
from geotext import GeoText
from bs4 import BeautifulSoup
from collections import Counter

import argparse
import io
import json
import os

import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt


from google.cloud import language
import numpy
import six
import numpy as np 

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
"""
Function: returns list of urls... 
Module Name: get_santaized_results_from_duckduckgo
Return: list of all urls
Description: module takes a query word as an input and then constructs the url (http://duckduckgo.com/html/?q=?). The url
then is scraped using beautilfulSoup library and all the href link are then collected in the list.
The list is then trimmed and unquoted to get a set of list items. Overall the process is tha simple.
"""

def get_sanitized_results_from_duckduckgo(query_word):
    # URL-encode the query word before building the search URL
    url_builder = ('http://duckduckgo.com/html/?q=%s' % urllib.parse.quote_plus(query_word))
    site = urllib.request.urlopen(url_builder)
    data = site.read()
    soup = BeautifulSoup(data, "html.parser")
    url_list = []
    for link in soup.find_all('a'):
        curr_string = str(link.get('href'))
        # DuckDuckGo wraps results in redirect links; keep only the embedded target URL
        start_pos = curr_string.find('http')
        if start_pos != -1:
            curr_string = curr_string[start_pos:]
            url = urllib.parse.unquote(urllib.parse.unquote(curr_string))
            url_list.append(url)

    # De-duplicate while preserving the original result order
    url_set = sorted(set(url_list), key=url_list.index)
    print(url_set)
    return url_set
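
A minimal usage sketch (the query term is illustrative, and the exact URLs depend on DuckDuckGo's results at the time):

urls = get_sanitized_results_from_duckduckgo("Nivin Anton")
print(len(urls), urls[:3])   # number of links found and the first few result URLs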
"""
Function: get all words...
Module Name: get_token_from_site
Return: All the words. 
Description: The site gets the url and then decodes to the utf-8.
After that parser is used to generate the html text alone and nltk is used to 
generate the word tokenize.
"""

def get_token_from_site(url):
    # Download the HTML content
    site = urllib.request.urlopen(url)
    html = site.read().decode('utf-8')
    # Strip the markup, keeping only the visible text
    raw = BeautifulSoup(html, "html.parser").get_text()
    # Tokenize the text
    tokens = nltk.wordpunct_tokenize(raw)
    text = nltk.Text(tokens)
    # Normalize to lower case and de-duplicate the words
    words = [w.lower() for w in text]
    vocab = sorted(set(words))

    print(vocab)
    return vocab
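
Note that wordpunct_tokenize splits strictly at punctuation boundaries, which behaves differently from nltk.word_tokenize on contractions and similar strings; a quick comparison:

print(nltk.wordpunct_tokenize("don't stop"))  # ['don', "'", 't', 'stop']
print(nltk.word_tokenize("don't stop"))       # ['do', "n't", 'stop']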
"""
Function: return all noun words...
Module Name: get_noun_from_site
Return: All nouns words. 
Description: The site gets the url and then decodes to the utf-8.
After that parser is used to generate the html text alone and nltk is used to 
generate the word tokenize.
"""


    

def get_noun_from_site(url):
    # Download the HTML content
    site = urllib.request.urlopen(url)
    html = site.read().decode('utf-8')
    # Strip the markup, keeping only the visible text
    raw = BeautifulSoup(html, "html.parser").get_text()
    # Tokenize the text
    tokens = nltk.wordpunct_tokenize(raw)
    text = nltk.Text(tokens)
    # Keep only the words tagged as nouns
    nouns = []
    for word, pos in nltk.pos_tag(text):
        if pos in ['NN', 'NNP']:  # feel free to add any other noun tags
            nouns.append(word)

    return nouns

def get_noun_from_site_counter(url):
    nouns = get_noun_from_site(url)
    counts = Counter(nouns)
    return counts
            

data = get_noun_from_site_counter("https://en.wikipedia.org/wiki/Nivin_Pauly")

print(data)
"""

"""
def get_places_list_from_site(url):
    # Download the HTML content
    site = urllib.request.urlopen(url)
    html = site.read().decode('utf-8')
    # Strip the markup, keeping only the visible text
    raw = BeautifulSoup(html, "html.parser").get_text()
    tokens = nltk.wordpunct_tokenize(raw)
    wordlist = nltk.Text(tokens)
    # Run each token through GeoText and collect country and city mentions
    countries = []
    cities = []
    for text in wordlist:
        places = GeoText(text)
        if len(places.countries) > 0:
            countries += places.countries
        if len(places.cities) > 0:
            cities += places.cities
    return (countries, cities)
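
A usage sketch (the output depends on the page content at crawl time). Because GeoText is applied token by token, only single-word place names are caught; running GeoText over the full raw text instead would also pick up multi-word names such as "New Delhi":

countries, cities = get_places_list_from_site("https://en.wikipedia.org/wiki/Nivin_Pauly")
print(set(countries), set(cities))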

def get_domain_name(url):
    # e.g. "https://en.wikipedia.org/wiki/Nivin_Pauly" -> "en.wikipedia.org"
    return url.split("//")[-1].split("/")[0]

def parse_mobile_n_email_address(url):
    f = urllib.request.urlopen(url)
    s = f.read().decode('utf-8')
    contact_info = []

    # Match mobile numbers of the form +XX XXXXXXXXXX (optional space and leading zero)
    aux = re.findall(r"\+\d{2}\s?0?\d{10}", s)
    if len(aux) > 0:
        contact_info.append(aux)
    # Match email addresses
    aux = re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}", s)
    if len(aux) > 0:
        contact_info.append(aux)
    return contact_info
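
A quick offline check of the two patterns (the sample strings are made up):

sample = "Call +91 9876543210 or write to jane.doe@example.com"
print(re.findall(r"\+\d{2}\s?0?\d{10}", sample))                                # ['+91 9876543210']
print(re.findall(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}", sample))   # ['jane.doe@example.com']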

def get_normalized_sentence(url):
    # Build one string from up to the 55 most common nouns, skipping short words
    data = get_noun_from_site_counter(url)
    ls = data.most_common()
    text = ""
    for each in range(min(55, len(ls))):
        if len(ls[each][0]) > 4:
            text = text + ls[each][0] + " "
    return text

def classify(text, verbose=True):
    """Classify the input text into categories. """

    language_client = language.LanguageServiceClient()

    document = language.types.Document(
        content=text,
        type=language.enums.Document.Type.PLAIN_TEXT)
    response = language_client.classify_text(document)
    categories = response.categories

    result = {}

    for category in categories:
        # Turn the categories into a dictionary of the form:
        # {category.name: category.confidence}, so that they can
        # be treated as a sparse vector.
        result[category.name] = category.confidence

    if verbose:
        print(text)
        for category in categories:
            print(u'=' * 20)
            print(u'{:<16}: {}'.format('category', category.name))
            print(u'{:<16}: {}'.format('confidence', category.confidence))

    return result
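
classify assumes Google Cloud credentials are already configured. A minimal sketch, assuming a service-account key file (the path here is hypothetical):

import os
# hypothetical path to your own service-account key
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json"
result = classify("football cricket match players score team league")
print(result)   # e.g. a dictionary like {'/Sports': 0.9, ...}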
query_word = "Nivin"
lis_data = get_sanitized_results_from_duckduckgo(query_word)
sz = len(lis_data)
ma = min(10, len(lis_data))
j = i = 0

accum = []
placesacc = []
mobilesacc = []
# Visit up to ten of the collected URLs, skipping any that fail to load
while i < ma and j < sz:
    try:
        each = lis_data[j]
        print("url", each)
        anes = classify(get_normalized_sentence(each))
        data = get_noun_from_site(each)
        if len(data) > 0:
            accum += data
        countries, cities = get_places_list_from_site(each)
        placesacc += countries
        placesacc += cities
        # Flatten the groups of mobile numbers and email addresses
        box = parse_mobile_n_email_address(each)
        for group in box:
            mobilesacc += group
        i += 1
    except Exception as e:
        print("Skipping url (%s)" % e)
    j += 1

placesacc = set(placesacc)
print("#### Places:", placesacc)
print("#### Mobile & Email Addresses:", mobilesacc)

res = Counter(accum)

# Keep only nouns longer than three characters and record their raw counts
normalized_nouns = []
frequency_vector = []
all_nouns = []
sume = 0
for k in res:
    if len(k) > 3:
        normalized_nouns.append((k, res[k]))
        frequency_vector.append(res[k])
        all_nouns.append(k)
        sume += res[k]

""" 
Scoring metric
VectorA dot VectorB dot VectorC = [Frequency Vector] dot [Confidence Vector] dot [Damage or Exploit Vector]
for simplicity, the confidence vector and damage vector are user fed. 
"""


sz = len(normalized_nouns)
confidence_vector = np.zeros(sz)
exploit_vector = np.zeros(sz)
user_defined_scored = np.zeros(sz)

# Plot the raw frequency counts of the extracted nouns
y_pos = np.arange(len(all_nouns))

plt.bar(y_pos, frequency_vector, align='center', alpha=0.5)
plt.xticks(y_pos, all_nouns)
plt.ylabel('Frequency Count')
plt.title('Frequency of Nouns')
plt.show()

# Placeholder values: in practice the confidence and exploit vectors are user fed
for i in range(sz):
    confidence_vector[i] = 0.5
    frequency_vector[i] /= sume
    exploit_vector[i] = 0.2
    user_defined_scored[i] = exploit_vector[i] * confidence_vector[i]

    
# frequency . (confidence * exploit), as defined in the scoring metric above, scaled by 4
score = np.dot(frequency_vector, user_defined_scored) * 4.0

print("Profile Tester Score is ", score)