{"id":752,"date":"2015-02-14T21:51:15","date_gmt":"2015-02-15T04:51:15","guid":{"rendered":"http:\/\/homepages.uc.edu\/~yaozo\/wordpress\/?p=752"},"modified":"2015-02-14T21:51:15","modified_gmt":"2015-02-15T04:51:15","slug":"convert-pdfs-to-text-files-or-csv-files-dfr-format-with-r","status":"publish","type":"post","link":"https:\/\/zhuoyao.net\/index.php\/2015\/02\/14\/convert-pdfs-to-text-files-or-csv-files-dfr-format-with-r\/","title":{"rendered":"Convert PDFs to text files or CSV files (DfR format) with R"},"content":{"rendered":"<pre class=\"\"># download pdftotxt from \n# ftp:\/\/ftp.foolabs.com\/pub\/xpdf\/xpdfbin-win-3.03.zip\n# and extract to your program files folder\n\n# here is a pdf for mining\nurl &lt;- \"http:\/\/www.noisyroom.net\/blog\/RomneySpeech072912.pdf\"\ndest &lt;- tempfile(fileext = \".pdf\")\ndownload.file(url, dest, mode = \"wb\")\n\n# set path to pdftotxt.exe and convert pdf to text\nexe &lt;- \"C:\\\\Program Files\\\\xpdfbin-win-3.03\\\\bin32\\\\pdftotext.exe\"\nsystem(paste(\"\\\"\", exe, \"\\\" \\\"\", dest, \"\\\"\", sep = \"\"), wait = F)\n\n# get txt-file name and open it  \nfiletxt &lt;- sub(\".pdf\", \".txt\", dest)\nshell.exec(filetxt); shell.exec(filetxt)    # strangely the first try always throws an error..\n\n\n# do something with it, i.e. 
a simple word cloud \nlibrary(tm)\nlibrary(wordcloud)\nlibrary(Rstem)\n\ntxt &lt;- readLines(filetxt) # don't mind warning..\n\ntxt &lt;- tolower(txt)\ntxt &lt;- removeWords(txt, c(\"\\\\f\", stopwords()))\n\ncorpus &lt;- Corpus(VectorSource(txt))\ncorpus &lt;- tm_map(corpus, removePunctuation)\ntdm &lt;- TermDocumentMatrix(corpus)\nm &lt;- as.matrix(tdm)\nd &lt;- data.frame(freq = sort(rowSums(m), decreasing = TRUE))\n\n# Stem words\nd$stem &lt;- wordStem(row.names(d), language = \"english\")\n\n# and put words to column, otherwise they would be lost when aggregating\nd$word &lt;- row.names(d)\n\n# remove web address (very long string):\nd &lt;- d[nchar(row.names(d)) &lt; 20, ]\n\n# aggregate frequency by word stem and\n# keep first words..\nagg_freq &lt;- aggregate(freq ~ stem, data = d, sum)\nagg_word &lt;- aggregate(word ~ stem, data = d, function(x) x[1])\n\nd &lt;- cbind(freq = agg_freq[, 2], agg_word)\n\n# sort by frequency\nd &lt;- d[order(d$freq, decreasing = T), ]\n\n# print wordcloud:\nwordcloud(d$word, d$freq)\n\n# remove files\nfile.remove(dir(tempdir(), full.name=T)) # remove files<\/pre>\n","protected":false},"excerpt":{"rendered":"<p># download pdftotxt from # ftp:\/\/ftp.foolabs.com\/pub\/xpdf\/xpdfbin-win-3.03.zip # and extract to your program files folder # here is a pdf for mining url &lt;- &#8220;http:\/\/www.noisyroom.net\/blog\/RomneySpeech072912.pdf&#8221; dest&hellip; 
<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[20],"tags":[],"class_list":["post-752","post","type-post","status-publish","format-standard","hentry","category-r"],"_links":{"self":[{"href":"https:\/\/zhuoyao.net\/index.php\/wp-json\/wp\/v2\/posts\/752","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/zhuoyao.net\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/zhuoyao.net\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/zhuoyao.net\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/zhuoyao.net\/index.php\/wp-json\/wp\/v2\/comments?post=752"}],"version-history":[{"count":0,"href":"https:\/\/zhuoyao.net\/index.php\/wp-json\/wp\/v2\/posts\/752\/revisions"}],"wp:attachment":[{"href":"https:\/\/zhuoyao.net\/index.php\/wp-json\/wp\/v2\/media?parent=752"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/zhuoyao.net\/index.php\/wp-json\/wp\/v2\/categories?post=752"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/zhuoyao.net\/index.php\/wp-json\/wp\/v2\/tags?post=752"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}