R code: clustering

# K-means clustering

# first example: made-up data

# Draw 100 numbers, normally distributed, with a mean of 0,

# and organize them in 2 columns so we have 50 points with 2 coordinates each.

cl1 = matrix(rnorm(100, sd = 0.3), ncol = 2)

# Draw 100 numbers with a mean of 1.5,

# again organized in 2 columns for 50 datapoints with 2 coordinates each

cl2 = matrix(rnorm(100, mean=1.5, sd = 0.3), ncol = 2)

# plotting both

plot(cl1, xlim = range(cl1, cl2), ylim = range(cl1, cl2))

points(cl2)

# combine the two sets of points, and let's see

# if the clustering method can separate them again

data = rbind(cl1, cl2)

# doing the clustering

clustering = kmeans(data, 2)

# inspecting the result

clustering

# the output looks something like this:

# "Cluster means" are the cluster centers,

# one center per line.

# "Clustering vector" assigns each datapoint in "data"

# a cluster, 1 or 2.

# "Within cluster sum of squares by cluster" is the sum of squared distances

# of the data points from their cluster mean, by cluster.

## K-means clustering with 2 clusters of sizes 50, 50

##

## Cluster means:

##         [,1]       [,2]

## 1 0.01778026 0.01465019

## 2 1.47658108 1.46337011

## Clustering vector:

##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

##  [45] 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

##  [89] 2 2 2 2 2 2 2 2 2 2 2 2

## Within cluster sum of squares by cluster:

## [1] 9.814072 9.629475

##  (between_SS / total_SS =  84.5 %)

## Available components:

## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"  

## [7] "size"         "iter"         "ifault"     

# how good is the clustering?

# withinss: sum of squared distances between each datapoint and its

# cluster mean

# betweenss: sum of squared distances between each cluster mean and

# the data mean

clustering$withinss

clustering$betweenss
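
# the quality measure printed above, "between_SS / total_SS", can be

# recomputed by hand from the totss and betweenss components:

clustering$betweenss / clustering$totss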

# visualizing the results of clustering

plot(data, col = clustering$cluster, pch=clustering$cluster)

# adding the cluster centers

points(clustering$centers, col = 1:2, pch = 8, cex = 2)

# for a more challenging dataset, set the second mean to 1.0,

# or try 3-4 clusters.

# In that case, use nstart = 25 so that kmeans() tries 25 random

# starting configurations and keeps the best one (see the sketch below).
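
# A sketch of that harder setup (names like cl2.hard are just

# illustrative, not from the original script):

cl2.hard = matrix(rnorm(100, mean = 1.0, sd = 0.3), ncol = 2)

data.hard = rbind(cl1, cl2.hard)

clustering.hard = kmeans(data.hard, 2, nstart = 25)

plot(data.hard, col = clustering.hard$cluster, pch = clustering.hard$cluster)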

#######

# K-means clustering, real dataset

# NYT opinion articles

nyt = read.table("nyt_opinion.txt", header=T)

content.words = nyt[,13:29]

# do the clustering of articles by content words.

# 4 clusters, 25 attempts with different random starting points.

nyt.cl = kmeans(content.words, 4, nstart = 25)

# inspecting the clusters

nyt$cl = nyt.cl$cluster

# How do the authors distribute over the clusters?

xtabs(~nyt$cl + nyt$Author)

# what do content word counts look like by cluster?

aggregate(content.words, list(nyt$cl), sum)
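
# clusters differ in size, so per-cluster means may be easier to compare

# than sums; a variant of the line above, not part of the original analysis:

aggregate(content.words, list(nyt$cl), mean)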

#############

# Hierarchical clustering

# clustering languages by similarity of morphosyntactic characteristics

library(languageR)

# make an object with pairwise distances of languages

dist.languages = dist(phylogeny[3:ncol(phylogeny)], method="binary")
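
# a quick peek at the pairwise distances (converting the dist object to

# a full matrix; the 1:5 indices are just for illustration):

as.matrix(dist.languages)[1:5, 1:5]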

# do the clustering, plot

# Suggestion: try out methods "ward.D", "ward.D2", "single", "complete", "average" and see how

# the clustering changes.

clustering = hclust(dist.languages, method="ward.D")

plot(clustering)

# add language names as labels so we can see which languages got clustered together

plotnames = as.character(phylogeny$Language)

plot(clustering, labels=plotnames, cex=0.8)
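
# to turn the dendrogram into a hard clustering, cut the tree into a

# fixed number of groups with cutree(); the choice of 4 groups here is

# arbitrary, just for illustration:

groups = cutree(clustering, k = 4)

table(groups)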

###############

# Dimensionality reduction:

# Code from the Baayen book

library(languageR)

# affixProductivity dataset: stems can be combined with different

# affixes, for example warm+th, clever+ness

#

# Question: is affix productivity related to stylistic factors?

# affixProductivity dataset lists, for 44 texts with varying

# authors and genres, a productivity index for 27 affixes. Genres:

# religious, children's books, literary texts, and others.

# Here: use PCA to explore what dimensions give relevant information

# about the data. All columns except the last three have affixes. Last three

# columns encode info about author, genre, and birth year of author.

# doing the principal component analysis (PCA)

# run with scale=T, as PCA is sensitive to

# different variables being on different scales

affixes.pr = prcomp(affixProductivity[,1:(ncol(affixProductivity)-3)], scale=T)

# what do we have in the resulting object?

names(affixes.pr)

# Result:

# [1] "sdev"     "rotation" "center"   "scale"    "x"

# summary(affixes.pr) prints the standard deviation of each

# principal component, and the proportion of variance explained by

# it. Proportion of variance: squared sdev divided by the sum of all

# squared sdevs. The standard deviations are also accessible via

# affixes.pr$sdev

summary(affixes.pr)

# this is the proportion of variance explained by each principal component.

# It is in the summary, but you can also get it as a vector like this:

affixes.pr$sdev^2 / sum(affixes.pr$sdev^2)
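
# the cumulative proportion (the "Cumulative Proportion" row in the

# summary) is the running sum of that vector:

cumsum(affixes.pr$sdev^2) / sum(affixes.pr$sdev^2)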

# Rules of thumb:

# * keep only principal components that account for at

#    least 5% of the variance.

# * visualize proportion of variance explained by each principal

#   component. Only keep components up to first major drop in the graph

# visualization:

plot(affixes.pr)
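
# screeplot() draws the same information; type = "lines" makes the

# drop between components easier to spot:

screeplot(affixes.pr, type = "lines")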

# The original dataset affixProductivity has one book per line.

# Each of these books has coordinates on the new principal components.

# Here are the coordinates on the first three principal components

# for some books

affixes.pr$x[c("Mormon", "Austen", "Carroll", "Gao"), 1:3]
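
# the texts can also be plotted in the space of the first two principal

# components, colored by genre. Assumption (based on the description

# above): genre is the second-to-last column of affixProductivity.

genre = affixProductivity[, ncol(affixProductivity) - 1]

plot(affixes.pr$x[, 1:2], col = as.numeric(as.factor(genre)))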

# How can we interpret principal components?

# One way is to look at the "loadings" of the affixes on the

# principal components. Flip the matrix around and view these loadings as

# coordinates of the affixes in the new dimensions.

# Interpret a principal component by checking which affixes

# have a large positive or negative loading on it.

# Loadings are available in affixes.pr$rotation

affixes.pr$rotation[1:10, 1:3]
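
# for example, sorting the loadings on the first principal component

# shows which affixes pull it most strongly in either direction:

sort(affixes.pr$rotation[, 1])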

####

# PCA as soft clustering

# texts that have high values on the same

# principal dimensions, or affixes that have high loadings on the same

# principal dimensions, can be viewed as more or less clustered. Each

# principal component can then be viewed as a soft cluster.

####

# Visualization

# Visualize clustering of texts and clustering of affixes in the same

# plot: biplot.

biplot(affixes.pr, scale = 0, var.axes = F,

col = c("darkgrey", "black"), cex = c(0.9, 1.2))