0%

Sparkclouds

Sparkclouds

实现思路

找出 Shakespeare’s Sonnets 1-20, 存为 .txt 文件,然后进行以下操作:

统计词频

用 C++ 写个程序,统计词频,能处理大小写、标点符号、’s 后缀等,代码如下:

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <string>

/// Maps each normalized word to the number of times it was seen.
typedef std::map<std::string, int> StrIntMap;

/// Punctuation characters that are stripped from the end of every token.
char marks[] = { ',', '.', ':', '?', '!', ';' };

/// Returns true when `c` must be removed from the end of a token: any entry of
/// `marks`, or a stray NUL / newline character.
static bool isTrailingJunk(char c) {
    if (c == '\0' || c == '\n')
        return true;
    for (char m : marks) {
        if (c == m)
            return true;
    }
    return false;
}

/// Decides whether the leading capital of `s` should be lowercased.
/// Precondition: `s` is non-empty and `s[0]` is in 'A'..'Z'.
///  - A word starting with 'I' is lowercased only when its second character
///    is a lowercase letter ("In" -> "in"; "I", "I'll", "IT" are kept).
///  - Any other single capital letter is lowercased ("A" -> "a").
///  - A word written entirely in capitals is kept ("WORLD").
static bool shouldLowerFirst(const std::string& s) {
    if (s[0] == 'I')
        return s.size() > 1 && s[1] >= 'a' && s[1] <= 'z';
    if (s.size() == 1)
        return true;
    for (std::string::size_type i = 1; i < s.size(); ++i) {
        if (s[i] < 'A' || s[i] > 'Z')
            return true;
    }
    return false;
}

/// Reads whitespace-separated tokens from `in`, normalizes each one and
/// increments its counter in `words`.
///
/// Normalization steps, in order:
///  1. strip every trailing punctuation mark (all of them, in any order);
///  2. skip the token if nothing is left (it was pure punctuation);
///  3. lowercase a leading capital according to shouldLowerFirst();
///  4. drop a trailing "'s" suffix from words longer than two characters.
///
/// @param in     stream to read tokens from
/// @param words  map that receives the counts (existing entries are kept and
///               incremented)
void wordCnt(std::istream& in, StrIntMap& words) {
    std::string s;
    while (in >> s) {
        // Loop until no trailing mark remains so that "rose,." and "What?!"
        // are fully cleaned, and never touch back() of an empty string.
        while (!s.empty() && isTrailingJunk(s.back()))
            s.pop_back();
        if (s.empty())
            continue;  // token consisted only of punctuation

        if (s[0] >= 'A' && s[0] <= 'Z' && shouldLowerFirst(s))
            s[0] += ('a' - 'A');

        // "beauty's" -> "beauty"; a bare "'s" (size 2) is left alone.
        if (s.size() > 2 && s.back() == 's' && s[s.size() - 2] == '\'')
            s.erase(s.size() - 2);

        ++words[s];
    }
}

/// Entry point: counts the words of the text file named by argv[1] and writes
/// one "word,count" row per distinct word to "<argv[1]>.csv".
///
/// @return 0 on success, EXIT_FAILURE when the argument is missing or a file
///         cannot be opened or written.
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "wordcount")
                  << " <text-file>\n";
        return EXIT_FAILURE;
    }

    std::ifstream in(argv[1]);
    if (!in) {
        std::cerr << "open error\n";
        return EXIT_FAILURE;
    }

    std::string outname = argv[1];
    outname += ".csv";

    // Open write-only and truncating: the previous append mode added a second
    // full set of rows to the CSV every time the program was re-run.
    std::ofstream opt(outname);
    if (!opt) {
        std::cerr << "fail\n";
        return EXIT_FAILURE;
    }

    StrIntMap w;
    wordCnt(in, w);

    for (const auto& entry : w) {
        opt << entry.first << "," << entry.second << "\n";
    }

    // Surface write errors (disk full, etc.) instead of silently succeeding.
    opt.flush();
    if (!opt) {
        std::cerr << "fail\n";
        return EXIT_FAILURE;
    }

    return 0;
}

然后处理那 20 个 .txt 文档:

image.png

当然还有一个 all.txt 用于统计所有词频。

整理数据

先手动处理了古英语的 ' 前缀缩写。

用 Excel 的 VLOOKUP 函数整合统计所有 .csv 文件,最终生成一个:

image.png

D3 实现

对于总词频低的单词,统一用小字号;词频高的,根据词频调整字号,并在下方根据 20 个 Sonnets 各自出现频率画出 sparkline 。

根据字号、单词长度,粗略计算下一个单词应该出现的坐标。对于词频高的单词预留较大空间。每个新加的单词根据已占用的空间判断其位置。

效果图

image.png

Source Code

<html>
<meta charset = "utf-8">
<script src = "https://d3js.org/d3.v4.js"></script>
<div id = "word_cloud"></div>
<script>
// Drawing-area geometry: an 800x600 outer size with a 20px margin on each side.
var margin = {top: 20, right: 20, bottom: 20, left: 20};
var width = 800 - (margin.left + margin.right);
var height = 600 - (margin.top + margin.bottom);

// Create the <svg> at the full outer size, then draw inside a <g> that is
// shifted by the left/top margin.
var svgRoot = d3.select("#word_cloud").append("svg");
svgRoot.attr("width", width + margin.left + margin.right);
svgRoot.attr("height", height + margin.top + margin.bottom);

var svg = svgRoot.append("g");
svg.attr("transform", "translate(" + margin.left + "," + margin.top + ")");

// Occupancy grid indexed as positionTaken[x][y]: (width + 101) columns of
// (height + 101) cells, every cell starting at 0 (free).
var positionTaken = [];
for (var col = 0; col < width + 101; col++) {
    var column = [];
    for (var row = 0; row < height + 101; row++) {
        column.push(0);
    }
    positionTaken.push(column);
}
console.log(positionTaken);

// Loads the word-count table and lays the words out left-to-right, line by line.
// Each row is read through the columns "word", "SUM" and "S1".."S20".
// Words whose SUM exceeds 6 are drawn large (font size = SUM px) in dark blue
// with a sparkline of the 20 per-sonnet values; all others are drawn at 6px.
d3.csv("https://raw.githubusercontent.com/Co10/d3_files/master/CSV_files/04/0/20_sonets_wordcount.csv", function(data) {
    // With a one-argument callback, D3 v4 passes null when the request fails;
    // bail out instead of throwing on data.length.
    if (!data) {
        console.error("failed to load the word-count CSV");
        return;
    }
    console.log("data", data);

    // Current pen position; both are read lazily by lineFunc and markTaken.
    var x0 = margin.left, y0 = margin.top;
    var thisline = 0;

    // Sparkline generator: points are relative to the current pen position.
    var lineFunc = d3.line()
        .x(function(d) { return x0 + d.x; })
        .y(function(d) { return y0 + d.y; });

    // Marks the rectangle from (x0, y0) to (xEnd, yEnd) as occupied, ignoring
    // cells beyond (width + 10, height + 10).
    function markTaken(xEnd, yEnd) {
        for (var px = x0; px <= xEnd; px++) {
            for (var py = y0; py <= yEnd; py++) {
                if ((px <= width + 10) && (py <= height + 10))
                    positionTaken[px][py] = 1;
            }
        }
    }

    for (var i = 0; i < data.length; i++) {
        var word = data[i]["word"];
        var total = +data[i]["SUM"];   // CSV fields are strings; convert once
        var bigword = total > 6;
        var wordsize = "6px", wordcolor = "#ACAEE3";

        if (bigword) {
            wordsize = total + "px";
            wordcolor = "#254170";
            thisline = Math.max(thisline, total);
            // Reserve room proportional to word length and font size.
            markTaken(x0 + word.length * 0.5 * total,
                      y0 + word.length * (total / 5));
        }
        else {
            // Small words get a fixed 3px-per-letter by 8px footprint.
            markTaken(x0 + 3 * word.length, y0 + 8);
        }

        // NOTE(review): "alignment-baseline" on <text> may not be honoured by
        // every browser; "dominant-baseline" could be the portable choice — confirm.
        svg.append("text")
            .attr("x", x0).attr("y", y0)
            .text(word)
            .style("font-size", wordsize)
            .style("fill", wordcolor)
            //.attr("text-anchor", "start")
            .attr("alignment-baseline", "central");

        if (bigword) {
            // One point per sonnet column S1..S20, spread evenly across the
            // reserved width; higher values are drawn further up.
            var newdata = [];
            var step = total * word.length * 0.5 / 20;
            var x = 0;
            for (var j = 1; j < 21; j++) {
                if (j > 1)
                    x += step;
                var y = total / 2 - (+data[i]["S" + j]);
                newdata.push({x: x, y: y});
            }
            //console.log(newdata);
            svg.append("path")
                .attr("d", lineFunc(newdata))
                .attr("stroke", "#254170")
                .attr("fill", "none");

            // Baseline (zero level) of the sparkline.
            svg.append("line")
                .attr("x1", x0 + newdata[0].x)
                .attr("y1", y0 + total / 2)
                .attr("x2", x0 + newdata[19].x)
                .attr("y2", y0 + total / 2)
                .attr("stroke", "#254170");
        }

        // Advance the pen past every occupied cell on the current row.
        var delta_x = 0;
        for (var ii = x0; ii <= width; ii++) {
            if (positionTaken[ii][y0] == 1)
                delta_x++;
            else
                break;
        }
        x0 += delta_x;

        // Wrap to the next text line once the right edge is reached.
        if (x0 >= (width - margin.right)) {
            x0 = margin.left;
            y0 += 8;
            // Disabled experiment: extra line spacing after a line with big words.
            //if (thisline > 0) {
            //y0 += (thisline/2 - 8);
            //thisline = 0;
            //}
        }
    }
})

</script>
</html>