# R code: clustering
# K-means clustering
# first example: made-up data
# Draw 100 numbers, normally distributed, with a mean of 0,
# and organize them in 2 columns so we have 50 points with 2 coordinates each.
cl1 = matrix(rnorm(100, sd = 0.3), ncol = 2)
# Draw 100 numbers with a mean of 1.5,
# again organized in 2 columns for 50 datapoints with 2 coordinates each
cl2 = matrix(rnorm(100, mean=1.5, sd = 0.3), ncol = 2)
# plotting both
plot(cl1, xlim = range(cl1, cl2), ylim = range(cl1, cl2))
points(cl2)
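# (Optional sketch: plot the two simulated groups in different colors,
# so we can later compare the known ground truth to the clustering result;
# the color choices are arbitrary.)
plot(cl1, xlim = range(cl1, cl2), ylim = range(cl1, cl2), col = "blue")
points(cl2, col = "red")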
# combine the two sets of points, and let's see
# if the clustering method can separate them again
data = rbind(cl1, cl2)
# doing the clustering
clustering = kmeans(data, 2)
# inspecting the result
clustering
# The output looks something like this.
# "Cluster means" are the cluster centers,
# one center per line.
# "Clustering vector" assigns each datapoint in "data"
# a cluster, 1 or 2.
# "Within cluster sum of squares by cluster" is the sum of squared distances
# of the data points from their cluster mean, by cluster.
## K-means clustering with 2 clusters of sizes 50, 50
##
## Cluster means:
##         [,1]       [,2]
## 1 0.01778026 0.01465019
## 2 1.47658108 1.46337011
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [45] 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [89] 2 2 2 2 2 2 2 2 2 2 2 2
## Within cluster sum of squares by cluster:
## [1] 9.814072 9.629475
## (between_SS / total_SS = 84.5 %)
## Available components:
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss"
## [7] "size" "iter" "ifault"
# how good is the clustering
# withinss: sum of squared distances between each datapoint and its
# cluster mean
# betweenss: sum of squared distances between each cluster mean and
# the data mean
clustering$withinss
clustering$betweenss
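# One summary number for clustering quality: the ratio betweenss / totss,
# which is the "between_SS / total_SS" percentage in the printed output
# above. Values closer to 1 mean the clusters separate the data better.
clustering$betweenss / clustering$totss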
# visualizing the results of clustering
plot(data, col = clustering$cluster, pch=clustering$cluster)
# adding the cluster centers
points(clustering$centers, col = 1:2, pch = 8, cex = 2)
# For a more challenging dataset, set the second mean to 1.0,
# or try 3-4 clusters.
# In that case, use nstart = 25, so that kmeans is run 25 times from
# different random starting points and the best solution is kept;
# a sketch follows below.
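# A minimal sketch of such a setup: a third group of points whose mean
# (1.0 here) sits between the first two, clustered with k = 3 and
# nstart = 25. (The particular means and sd are arbitrary choices.)
cl3 = matrix(rnorm(100, mean = 1.0, sd = 0.3), ncol = 2)
data3 = rbind(cl1, cl2, cl3)
clustering3 = kmeans(data3, 3, nstart = 25)
plot(data3, col = clustering3$cluster, pch = clustering3$cluster)
points(clustering3$centers, col = 1:3, pch = 8, cex = 2)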
#######
# K-means clustering, real dataset
# NYT opinion articles
nyt = read.table("nyt_opinion.txt", header=T)
content.words = nyt[,13:29]
# do the clustering of articles by content words.
# 4 clusters, 25 attempts with different random starting points.
nyt.cl = kmeans(content.words, 4, nstart = 25)
# store each article's cluster assignment with the data, for inspection
nyt$cl = nyt.cl$cluster
# How do the authors distribute over the clusters?
xtabs(~nyt$cl + nyt$Author)
# What do the content word counts look like by cluster?
aggregate(content.words, list(nyt$cl), sum)
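# (A sketch of a possible refinement, not part of the original analysis:
# raw counts depend on article length, so one could standardize each
# content-word column before clustering and compare the result.)
nyt.cl.scaled = kmeans(scale(content.words), 4, nstart = 25)
xtabs(~nyt.cl.scaled$cluster + nyt$Author)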
#############
# Hierarchical clustering
# clustering languages by similarity of morphosyntactic characteristics
library(languageR)
# make an object with pairwise distances of languages
dist.languages = dist(phylogeny[3:ncol(phylogeny)], method="binary")
# do the clustering, plot
# Suggestion: try out methods "ward.D", "ward.D2", "single", "complete", "average" and see how
# the clustering changes.
clustering = hclust(dist.languages, method="ward.D")
plot(clustering)
# add language names as labels, so we can see which languages got clustered together
plotnames = as.character(phylogeny$Language)
plot(clustering, labels=plotnames, cex=0.8)
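# (A sketch of a possible follow-up: cut the dendrogram into a fixed
# number of groups; k = 4 is an arbitrary choice here.)
groups = cutree(clustering, k = 4)
# list the languages in each group
split(plotnames, groups)
# draw boxes around the 4 groups in the dendrogram we just plotted
rect.hclust(clustering, k = 4)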
###############
# Dimensionality reduction:
# Code from the Baayen book
library(languageR)
# affixProductivity dataset: stems can be combined with different
# affixes, for example warm+th, clever+ness
#
# Question: is affix productivity related to stylistic factors?
# affixProductivity dataset lists, for 44 texts with varying
# authors and genres, a productivity index for 27 affixes. Genres:
# religious, children's books, literary texts, and others.
# Here: use PCA to explore what dimensions give relevant information
# about the data. All columns except the last three have affixes. Last three
# columns encode info about author, genre, and birth year of author.
# doing the principal component analysis (PCA)
# Run with scale=T, as PCA is sensitive to
# variables being measured on different scales.
affixes.pr = prcomp(affixProductivity[,1:(ncol(affixProductivity)-3)], scale=T)
# what do we have in the resulting object?
names(affixes.pr)
# Result:
## [1] "sdev"     "rotation" "center"   "scale"    "x"
# summary(affixes.pr) prints the standard deviation for each
# principal component, and the amount of variance explained by
# it. Proportion of variance: squared sdev divided by sum of squared
# sdevs. standard deviation is also accessible via
# affixes.pr$sdev
summary(affixes.pr)
# Proportion of variance explained by each principal component:
affixes.pr$sdev^2 / sum(affixes.pr$sdev^2)
# The cumulative proportion for components 1-k, as shown in the summary,
# is the cumulative sum of these values:
cumsum(affixes.pr$sdev^2) / sum(affixes.pr$sdev^2)
# Rules of thumb:
# * keep only principal components that account for at
#   least 5% of the variance (see the sketch below).
# * visualize the proportion of variance explained by each principal
#   component, and only keep components up to the first major drop
#   in the graph.
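# A minimal sketch of the first rule of thumb, re-using the
# proportions of variance computed above:
prop.var = affixes.pr$sdev^2 / sum(affixes.pr$sdev^2)
which(prop.var >= 0.05)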
# visualization:
plot(affixes.pr)
# The original dataset affixProductivity has one book per line.
# Each of these books has coordinates on the new principal components.
# Here are the coordinates on the first three principal components
# for some books
affixes.pr$x[c("Mormon", "Austen", "Carroll", "Gao"), 1:3]
# How can we interpret principal components?
# One way is to look at the "loadings" of the affixes on the
# principal components: flip the matrix around and view the loadings
# as coordinates of the affixes in the new dimensions.
# Interpret a principal component by checking which affixes
# have a large positive or negative value on it.
# Loadings are available in affixes.pr$rotation
affixes.pr$rotation[1:10, 1:3]
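# (A sketch of one way to do this check: sort the loadings on the first
# principal component; the affixes at either end of the sorted vector
# are the most informative ones.)
sort(affixes.pr$rotation[, 1])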
####
# PCA as soft clustering
# Texts that have high values on the same principal dimensions,
# or affixes that have high loadings on the same principal dimensions,
# can be viewed as loosely clustered together. Each
# principal component can then be viewed as a soft cluster.
####
# Visualization
# Visualize clustering of texts and clustering of affixes in the same
# plot: biplot.
biplot(affixes.pr, scale = 0, var.axes = F,
col = c("darkgrey", "black"), cex = c(0.9, 1.2))