makedictionary.pl


#!/usr/bin/perl

# Installation:
# 1 – save this file in your home directory
#     on Mac OS X or Linux;
# 2 – open the terminal;
# 3 – make the file executable by typing the command:
#     chmod 700 makedictionary.pl
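#     (chmod 700 gives you, the owner, read, write and execute
#     permission for the file, and no access for anyone else)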

# Usage:
# 1 – save your text file in plain text format in your home directory
# 2 – open the terminal
# 3 – type and run the command:
#     ./makedictionary.pl < mytext.txt > mytext-wordlist.txt
#     (replace "mytext.txt" with the real name of your text file)
# 4 – view mytext-wordlist.txt in any text program or browser, or import
#     it into Excel or any other spreadsheet or data-visualization
#     program as "Text with Tabs".
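#
# Example output (with hypothetical counts): one word per line,
# preceded by its frequency and a tab character, least frequent first:
#
#     1	network
#     3	post-digital
#     12	media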

# Explanation:
# This script does the following:
# 1 – reads any text file, breaking it up into single words and sorting
#     them into an alphabetical wordlist;
# 2 – compares that wordlist to our dictionary of allowed words and
#     keeps only the words that match;
# 3 – sorts the filtered list by frequency of the (key)words used;
# 4 – outputs this list as plain text with tab-separated fields.
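#
# For illustration: a line such as "Analyzing post-digital cultures."
# is lowercased and split into "analyzing", "post-digital" and
# "cultures"; the crude_root() subroutine below reduces these to
# "analyz", "post-digital" and "cultur", all of which appear in the
# dictionary, so all three words are kept and counted.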

# Dictionary of allowed words

$allowed_words =
"address
aesthetic
affair
affirm
after
again
agency
analog
analyz
apparent
apply
applied
art
artist
assume
big
call
capitalis
cause
change
clear
commerc
complex
concept
condition
critic
cultur
current
data
deleuze
denot
describ
develop
digit
dimension
distinct
diy
effect
enabl
environment
establish
everyday
exist
experien
experiment
fals
find
focus
force
function
gilles
histor
hold
idea
ideolog
image
immateria
impact
information
infra-ordinary
infrastruct
innovat
interact
interfac
investig
left
less
link
live
look
mainstream
market
marvel
material
media
medium
merge
messy
more
natur
network
new
normaliz
not-yet-actual
notion
object
old
other
paradig
past
people
percepti
politic
post
post-digital
potential
practic
precari
present
problem
process
produc
promis
question
real
recogn
refer
reflect
relat
repack
research
return
revolut
semant
sense
social
space
state
stream
structur
superior
system
technolog
tend
term
theor
thing
time
totalita
understand
use
virtual
way
world";

# Write the above word list into a structured array
@allowed_words = split("\n", $allowed_words);

# read source text file, split it up into words and store it in an alphabetically sorted array
while (<STDIN>) {
    # sanitize input
    chomp;
    # protect hyphens from Perl's "non-word" regular expression filter
    s/-/hyphenhyphenhyphen/g;
    # filter out "non-word" characters
    s/[\W]/ /g;
    # restore hyphens
    s/hyphenhyphenhyphen/-/g;
    # split up into single lowercase words & append to word list
    push @unfiltered_wordlist, split(/[\s]+/, lc($_));
}
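
# For example, the input line "Post-digital, DIY media!" passes
# through the substitutions above as:
#     "Posthyphenhyphenhyphendigital, DIY media!"  (hyphens protected)
#     "Posthyphenhyphenhyphendigital  DIY media "  (non-word characters blanked)
#     "Post-digital  DIY media "                   (hyphens restored)
# and is finally split into the lowercase words "post-digital", "diy"
# and "media".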

@unfiltered_wordlist = sort(@unfiltered_wordlist);

# match the wordlist against the dictionary of allowed words, write matching words into a new array

foreach $word (@unfiltered_wordlist) {
    $match_flag = 0;
    foreach $allowed_word (@allowed_words) {
        # accept the exact word as well as its crude root, so that
        # full-form dictionary entries such as "image" or "cause" can
        # match themselves
        if ($word eq $allowed_word || crude_root($word) eq $allowed_word) {
            $match_flag = 1;
            last;
        }
    }
    if ($match_flag == 1) {
        # print $word,"\n";
        push @filtered_wordlist, $word;
    }
}

$dict = join("\n", @filtered_wordlist);
system("echo \"$dict\" | uniq -c | sort -n | sed -e \"s/^[\t ]*//\" | sed -e \"s/ [ ]*/\t/g\"");
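
# The shell pipeline counts the words: given a (hypothetical) filtered
# list "diy", "media", "media", "uniq -c" collapses the already sorted
# words into "1 diy" and "2 media", "sort -n" orders the lines by
# ascending frequency, the first "sed" strips the leading whitespace
# that "uniq -c" prints, and the second "sed" turns the remaining
# spaces into a single tab.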

# This subroutine uses some crude filtering to reduce English words to their
# linguistic roots (in order to improve matching probability)

sub crude_root {
    my $word = $_[0];
    # the substitutions apply in sequence, so a word can lose more than
    # one ending (e.g. "applying" loses "ing" and then its final "y")
    $word =~ s/ing$//;
    $word =~ s/ied$//;
    $word =~ s/y$//;
    $word =~ s/ed$//;
    $word =~ s/e$//;
    $word =~ s/es$//;
    $word =~ s/s$//;
    return $word;
}
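
# Examples: crude_root("analyzing") returns "analyz",
# crude_root("history") returns "histor", and crude_root("cultures")
# returns "cultur"; each of these matches a stem in the dictionary
# above.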
