import pandas as pd
import numpy as np
cik_list = pd.read_excel("./cik_list.xlsx")
max_row, max_col = cik_list.shape
print(max_row)
pd.set_option('display.max_colwidth',100) # to display full text in column
cik_list.head()
cik_list.SECFNAME.head()
# prepending the SEC archive base URL to SECFNAME
link = 'https://www.sec.gov/Archives/'
cik_list.SECFNAME = link+cik_list.SECFNAME
cik_list.SECFNAME.head()
For each report, we need to extract three sections. Each section is delimited as follows:
ITEM (section_number). section_name (start)
section_content (body)
ITEM (next_section_number), or the SIGNATURES section if this is the last section (end)
If the form type starts with "NT", the form contains no data, so we do not need to process it.
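As a rough illustration, a pattern of the following shape can pick out one section between its ITEM heading and the next ITEM or SIGNATURES heading. This is a simplified sketch of the regex built further below; the section title and the sample string here are made up.
import re
sketch = re.compile(r"ITEM \d\. RISK FACTORS(?P<body>[\s\S]*?)(?:ITEM \d|SIGNATURES)")
sample = "ITEM 1. RISK FACTORS The company faces several risks... ITEM 2. PROPERTIES"
m = sketch.search(sample)
if m:
    print(m.group('body'))   # ' The company faces several risks... '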
# various imports
import requests
import re, string, unicodedata
import nltk
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
# building the stop word set from basic English plus the given stop word lists
nltk.download('stopwords')
nltk.download('punkt')
stopset = set(w.upper() for w in stopwords.words('english'))
# adding more stop words from the stop word text files
import glob
path = "StopWords*.txt"
glob.glob(path)
for filename in glob.glob(path):
    with open(filename, 'r') as f:
        text = f.read()
        text = re.sub(r"\s+\|\s+[\w]*", "", text)
        stopset.update(text.upper().split())
#print(len(stopset))
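The substitution above is intended to strip the ' | source' annotations that some of the stop word files carry, keeping only the words themselves. For example (the sample line is illustrative):
re.sub(r"\s+\|\s+[\w]*", "", "ABOUT | AUER")   # -> 'ABOUT'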
# syllable count (used for the complex word count)
from nltk.corpus import cmudict
nltk.download('cmudict')
d = cmudict.dict()
def syllables(word):
    # referred from stackoverflow.com/questions/14541303/count-the-number-of-syllables-in-a-word
    count = 0
    vowels = 'aeiouy'
    word = word.lower()
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index-1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    if count == 0:
        count += 1
    return count
def nsyl(word):
    try:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]])
    except KeyError:
        # if word not found in cmudict, fall back to the heuristic count
        return syllables(word)
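A quick check of the syllable counter: 'analysis' should be found in cmudict, while a made-up token falls back to the vowel-group heuristic (exact counts depend on the installed corpus).
print(nsyl('analysis'))   # cmudict lookup, typically 4
print(nsyl('qwertyz'))    # not in cmudict, handled by syllables()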
# other useful functions
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
def remove_between_square_brackets(text):
    return re.sub(r'\[[^]]*\]', '', text)
def remove_digits(text):
    return re.sub(r'[\d%/$]', '', text)
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_digits(text)
    return text
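A small example of what denoise_text removes (the snippet is made up): HTML tags, bracketed text, and digit/percent/slash/dollar characters.
denoise_text("<p>Revenue grew 5% in 2016 [see Note 3].</p>")
# -> 'Revenue grew  in  .'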
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words
def to_upper_case(words):
    """Convert all characters to uppercase in list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.upper()
        new_words.append(new_word)
    return new_words
def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words
def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopset:
            new_words.append(word)
    return new_words
# def stem_words(words):
#     """Stem words in list of tokenized words"""
#     stemmer = LancasterStemmer()
#     stems = []
#     for word in words:
#         stem = stemmer.stem(word)
#         stems.append(stem)
#     return stems
# def lemmatize_verbs(words):
#     """Lemmatize verbs in list of tokenized words"""
#     lemmatizer = WordNetLemmatizer()
#     lemmas = []
#     for word in words:
#         lemma = lemmatizer.lemmatize(word, pos='v')
#         lemmas.append(lemma)
#     return lemmas
def normalize(words):
    words = remove_non_ascii(words)
    words = to_upper_case(words)
    words = remove_punctuation(words)
    # words = replace_numbers(words)
    words = remove_stopwords(words)
    return words
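For instance, normalize() upper-cases tokens, strips punctuation, and drops stop words (the sentence is illustrative; the exact output depends on which stop word lists were loaded above).
normalize("The company's liquidity may be restricted.".split())
# -> something like ['COMPANYS', 'LIQUIDITY', 'MAY', 'RESTRICTED']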
# def stem_and_lemmatize(words):
#     stems = stem_words(words)
#     lemmas = lemmatize_verbs(words)
#     return stems, lemmas
# section names
MDA = "Management's Discussion and Analysis"
QQDMR = "Quantitative and Qualitative Disclosures about Market Risk"
RF = "Risk Factors"
section_name = ['MDA','QQDMR',"RF"]
section = [MDA.upper(),QQDMR.upper(),RF.upper()]
variables = ['positive_score','negative_score','polarity_score','average_sentence_length', 'percentage_of_complex_words',\
'fog_index','complex_word_count','word_count','uncertainty_score','constraining_score', 'positive_word_proportion',\
'negative_word_proportion', 'uncertainty_word_proportion', 'constraining_word_proportion' ]
import itertools
constraining_words_whole_report = pd.Series(dtype=float, name='constraining_words_whole_report')
df_col = [sec.lower() + '_' + var for sec,var in itertools.product(section_name,variables) ]
df = pd.DataFrame(columns=df_col)
df.shape
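Each of the 3 sections gets the 14 variables above, giving 3 * 14 = 42 columns named section_variable, e.g.:
df_col[:3]   # ['mda_positive_score', 'mda_negative_score', 'mda_polarity_score']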
# useful dictionaries
master_dict = pd.read_csv('./LoughranMcDonald_MasterDictionary_2016.csv', index_col= 0)
constraining_dict = set(pd.read_excel('./constraining_dictionary.xlsx',index_col = 0).index)
uncertainty_dict = set(pd.read_excel('./uncertainty_dictionary.xlsx', index_col = 0).index)
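Since normalize() upper-cases the tokens, the lookups below assume these word lists are also stored in upper case (as the Loughran-McDonald files usually are); a quick peek to confirm:
print(master_dict.index[:5])
print(list(uncertainty_dict)[:5], list(constraining_dict)[:5])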
cik_list.loc[64]
# # saving all forms locally (run once)
# for i in range(max_row):
#     text = requests.get(cik_list.SECFNAME[i]).text
#     file_name = 'form' + str(i)
#     f = open(file_name, 'a+')
#     f.write(text)
#     f.close()
for i in range(max_row):
    #print(i)
    file_name = './form/form' + str(i)
    with open(file_name, 'r') as f:
        text = f.read()
    print('reading..', end=" ")
    #constraining_words_whole_report
    # constraining_words_whole_report_count = 0
    # for word in denoise_text(text).split():
    #     if word in constraining_dict:
    #         constraining_words_whole_report_count += 1
    # print('here...', end=" ")
    # constraining_words_whole_report.loc[i] = constraining_words_whole_report_count
    ####################################
    df.loc[i] = np.zeros(42)  # 3 sections * 14 variables
    # other variables per section
    for j in range(3):
        if i in [63, 64]:
            continue
        print(i, j, sep='|', end=" ")
        exp = r".*(?P<start>ITEM [\d]\. " + re.escape(section[j]) + r")(?P<MDA>.*)(?P<body>[\s\S]*)(?P<end>ITEM \d|SIGNATURES)"
        regexp = re.compile(exp)
        s = regexp.search(text)
        if s:
            data = s.group('body')
            # keep the full form text intact so the next section can still be searched
            sec_text = denoise_text(data)
            sent_list = sent_tokenize(sec_text)
            sentence_length = len(sent_list)
            sample = sec_text.split()
            sample = normalize(sample)
            word_count = len(sample)
            complex_word_count = 0
            for word in sample:
                if nsyl(word.lower()) > 2:
                    complex_word_count += 1
            average_sentence_length = word_count/sentence_length
            percentage_of_complex_words = complex_word_count/word_count
            fog_index = 0.4 * (average_sentence_length + percentage_of_complex_words)
            positive_score = 0
            negative_score = 0
            uncertainty_score = 0
            constraining_score = 0
            for word in sample:
                if word in master_dict.index:
                    #print("is here")
                    if master_dict.loc[word].Positive > 0:
                        #print("positive")
                        positive_score += 1
                    if master_dict.loc[word].Negative > 0:
                        negative_score += 1
                if word in uncertainty_dict:
                    uncertainty_score += 1
                if word in constraining_dict:
                    constraining_score += 1
            #print(positive_score)
            polarity_score = (positive_score-negative_score)/(positive_score + negative_score + .000001)
            positive_word_proportion = positive_score/word_count
            negative_word_proportion = negative_score/word_count
            uncertainty_word_proportion = uncertainty_score/word_count
            constraining_word_proportion = constraining_score/word_count
            prefix = section_name[j].lower() + "_"
            df.loc[i, prefix + "positive_score"] = positive_score
            df.loc[i, prefix + "negative_score"] = negative_score
            df.loc[i, prefix + "polarity_score"] = polarity_score
            df.loc[i, prefix + "average_sentence_length"] = average_sentence_length
            df.loc[i, prefix + "percentage_of_complex_words"] = percentage_of_complex_words
            df.loc[i, prefix + "fog_index"] = fog_index
            df.loc[i, prefix + "complex_word_count"] = complex_word_count
            df.loc[i, prefix + "word_count"] = word_count
            df.loc[i, prefix + "uncertainty_score"] = uncertainty_score
            df.loc[i, prefix + "constraining_score"] = constraining_score
            df.loc[i, prefix + "positive_word_proportion"] = positive_word_proportion
            df.loc[i, prefix + "negative_word_proportion"] = negative_word_proportion
            df.loc[i, prefix + "uncertainty_word_proportion"] = uncertainty_word_proportion
            df.loc[i, prefix + "constraining_word_proportion"] = constraining_word_proportion
for i in range(max_row):
    print(i, end=" ")
    file_name = './form/form' + str(i)
    with open(file_name, 'r') as f:
        text = f.read()
    print('reading..', end=" ")
    #constraining_words_whole_report
    constraining_words_whole_report.loc[i] = 0
    constraining_words_whole_report_count = 0
    for word in denoise_text(text).split():
        if word.upper() in constraining_dict:  # upper-case the token to match the dictionary
            constraining_words_whole_report_count += 1
    print('here...', end=" ")
    constraining_words_whole_report.loc[i] = constraining_words_whole_report_count
# joining the pieces into the output format
df = pd.concat([cik_list,df,constraining_words_whole_report], axis = 1)
df.shape
df.head(10)
with pd.ExcelWriter('./output.xlsx') as writer:
    df.to_excel(writer, sheet_name='output')