Add skipped files to a log. Fixed errors with images that no longer exist online.
pull/1/head
sigonasr2 5 years ago
parent e1f3fa66fc
commit f84e1ae456
  1. BIN
      Tagger/Tagger.jar
  2. 183
      Tagger/src/PixivManager.java
  3. 14
      Tagger/src/utils.java

Binary file not shown.

@ -1,5 +1,6 @@
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
@ -28,6 +29,8 @@ public class PixivManager {
}
}
}
File skippedItems = new File("skippedItems.txt");
skippedItems.delete();
folder.mkdirs();
File outputTest = new File("TAG_DATA.txt");
FileWriter fwOutput;
@ -42,104 +45,114 @@ public class PixivManager {
if (!new File("downloadedData/temp"+s+".html").exists()) {
System.out.println("Starting download of "+url+" ...");
utils.downloadFileFromUrl(url, "downloadedData/temp"+s+".html");
String[] data = utils.readFromFile("downloadedData/temp"+s+".html");
int scriptEndLine = 0;
while (scriptEndLine<data.length) {
if (data[scriptEndLine].contains("<meta name=\"preload-data\" id=\"meta-preload-data\" content='")) {
System.out.println("Found JSON Target line at line "+scriptEndLine+". :: "+data[scriptEndLine] );
break;
if (new File("downloadedData/temp"+s+".html").exists()) {
String[] data = utils.readFromFile("downloadedData/temp"+s+".html");
int scriptEndLine = 0;
while (scriptEndLine<data.length) {
if (data[scriptEndLine].contains("<meta name=\"preload-data\" id=\"meta-preload-data\" content='")) {
System.out.println("Found JSON Target line at line "+scriptEndLine+". :: "+data[scriptEndLine] );
break;
}
scriptEndLine++;
}
scriptEndLine++;
}
if (scriptEndLine==data.length) {
System.out.println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
System.out.println(" IMAGE "+s+" FAILED TO PARSE CORRECTLY! Something is messed up about the file!!");
System.out.println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
}
File finaldata = new File("finaltemp");
FileWriter fw;
try {
fw = new FileWriter(finaldata);
BufferedWriter bw = new BufferedWriter(fw);
System.out.println(data[scriptEndLine]);
int cutpos = data[scriptEndLine].indexOf("<meta name=\"preload-data\" id=\"meta-preload-data\" content='")+58;
System.out.println(data[scriptEndLine].length()+"///"+data[scriptEndLine].indexOf("}}}'>")+"///"+cutpos);
if (cutpos<data[scriptEndLine].length()) {
bw.write(data[scriptEndLine].substring(cutpos,data[scriptEndLine].indexOf("}}}'>")+3));
System.out.println(data[scriptEndLine].substring(cutpos,data[scriptEndLine].indexOf("}}}'>")+3));
if (scriptEndLine==data.length) {
System.out.println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
System.out.println(" IMAGE "+s+" FAILED TO PARSE CORRECTLY! Something is messed up about the file!!");
System.out.println("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
}
bw.close();
fw.close();
} catch (IOException e) {
e.printStackTrace();
}
JSONObject jsonData = utils.readJsonFromFile("finaltemp");
//System.out.println(Arrays.deepToString(JSONObject.getNames(jsonData.getJSONObject("preload"))));
//System.out.println(Arrays.deepToString(JSONObject.getNames(jsonData.getJSONObject("preload").getJSONObject("illust"))));
JSONArray tagsArray = jsonData.getJSONObject("illust").getJSONObject(s).getJSONObject("tags").getJSONArray("tags");
for (int i=0;i<tagsArray.length();i++) {
boolean hasEnglishTag=false;
JSONObject tag = tagsArray.getJSONObject(i);
String ENTag="";
//String romaji="";
/*if (tag.has("romaji") && !tag.isNull("romaji")) {
romaji = tag.getString("romaji");
}*/
if (tag.has("translation")) {
JSONObject translationObj = tag.getJSONObject("translation");
if (translationObj.has("en")) {
hasEnglishTag=true;
ENTag = translationObj.getString("en");
File finaldata = new File("finaltemp");
FileWriter fw;
try {
fw = new FileWriter(finaldata);
BufferedWriter bw = new BufferedWriter(fw);
System.out.println(data[scriptEndLine]);
int cutpos = data[scriptEndLine].indexOf("<meta name=\"preload-data\" id=\"meta-preload-data\" content='")+58;
System.out.println(data[scriptEndLine].length()+"///"+data[scriptEndLine].indexOf("}}}'>")+"///"+cutpos);
if (cutpos<data[scriptEndLine].length()) {
bw.write(data[scriptEndLine].substring(cutpos,data[scriptEndLine].indexOf("}}}'>")+3));
System.out.println(data[scriptEndLine].substring(cutpos,data[scriptEndLine].indexOf("}}}'>")+3));
}
} else
if (tag.has("tag") && /*romaji.length()==0 &&*/ !tag.getString("tag").matches(".*[ぁ-んァ-ン一-龯]")) {
hasEnglishTag=true;
ENTag = tag.getString("tag");
}
if (ENTag.replaceAll("\\?", "").trim().length()==0) {
ENTag="";
hasEnglishTag=false;
bw.close();
fw.close();
} catch (IOException e) {
e.printStackTrace();
}
boolean tagSubmitted=false;
String insertedTag="";
if (hasEnglishTag && ENTag.length()>0) {
insertedTag = ENTag;
tagSubmitted=true;
} /*else
if (romaji.length()>0){
insertedTag = romaji;
tagSubmitted=true;
}*/
if (tagSubmitted) {
if (imageTag.tag_whitelist.size()==0 || imageTag.tag_whitelist.containsKey(insertedTag.toLowerCase())) {
if (imageTag.taglist.containsKey(s)) {
List<String> tags = imageTag.taglist.get(s);
tags.add(insertedTag);
imageTag.taglist.put(s, tags);
} else {
List<String> tags = new ArrayList<String>();
tags.add(insertedTag);
imageTag.taglist.put(s,tags);
JSONObject jsonData = utils.readJsonFromFile("finaltemp");
//System.out.println(Arrays.deepToString(JSONObject.getNames(jsonData.getJSONObject("preload"))));
//System.out.println(Arrays.deepToString(JSONObject.getNames(jsonData.getJSONObject("preload").getJSONObject("illust"))));
JSONArray tagsArray = jsonData.getJSONObject("illust").getJSONObject(s).getJSONObject("tags").getJSONArray("tags");
for (int i=0;i<tagsArray.length();i++) {
boolean hasEnglishTag=false;
JSONObject tag = tagsArray.getJSONObject(i);
String ENTag="";
//String romaji="";
/*if (tag.has("romaji") && !tag.isNull("romaji")) {
romaji = tag.getString("romaji");
}*/
if (tag.has("translation")) {
JSONObject translationObj = tag.getJSONObject("translation");
if (translationObj.has("en")) {
hasEnglishTag=true;
ENTag = translationObj.getString("en");
}
if (imageTag.tagCounter.containsKey(insertedTag)) {
imageTag.tagCounter.put(insertedTag,imageTag.tagCounter.get(insertedTag)+1);
} else {
imageTag.tagCounter.put(insertedTag,1);
} else
if (tag.has("tag") && /*romaji.length()==0 &&*/ !tag.getString("tag").matches(".*[ぁ-んァ-ン一-龯]")) {
hasEnglishTag=true;
ENTag = tag.getString("tag");
}
if (ENTag.replaceAll("\\?", "").trim().length()==0) {
ENTag="";
hasEnglishTag=false;
}
boolean tagSubmitted=false;
String insertedTag="";
if (hasEnglishTag && ENTag.length()>0) {
insertedTag = ENTag;
tagSubmitted=true;
} /*else
if (romaji.length()>0){
insertedTag = romaji;
tagSubmitted=true;
}*/
if (tagSubmitted) {
if (imageTag.tag_whitelist.size()==0 || imageTag.tag_whitelist.containsKey(insertedTag.toLowerCase())) {
if (imageTag.taglist.containsKey(s)) {
List<String> tags = imageTag.taglist.get(s);
tags.add(insertedTag);
imageTag.taglist.put(s, tags);
} else {
List<String> tags = new ArrayList<String>();
tags.add(insertedTag);
imageTag.taglist.put(s,tags);
}
if (imageTag.tagCounter.containsKey(insertedTag)) {
imageTag.tagCounter.put(insertedTag,imageTag.tagCounter.get(insertedTag)+1);
} else {
imageTag.tagCounter.put(insertedTag,1);
}
}
}
}
String taglist = s+": <"+imageTag.taglist.get(s)+">";
System.out.println(taglist);
bwOutput.append(taglist);
bwOutput.newLine();
//jsonData.getJSONObject("preload").getJSONObject("illust").getJSONObject(s).getJSONObject("tags");
} else {
System.out.println("Skipping image "+s+" because webpage cannot be found.");
utils.logToFile(s+"\n", "skippedItems.txt");
}
String taglist = s+": <"+imageTag.taglist.get(s)+">";
System.out.println(taglist);
bwOutput.append(taglist);
bwOutput.newLine();
//jsonData.getJSONObject("preload").getJSONObject("illust").getJSONObject(s).getJSONObject("tags");
} else {
System.out.println("Skipping image "+s+" because it has already been processed.");
}
} catch (IOException e) {
e.printStackTrace();
if (e instanceof FileNotFoundException) {
System.out.println("Skipping image "+s+" because webpage cannot be found.");
utils.logToFile(s, "skippedItems.txt");
} else {
e.printStackTrace();
}
}
/*org.apache.commons.io.FileUtils.copyURLToFile(new URL(
url

@ -1,6 +1,7 @@
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
@ -256,7 +257,7 @@ public class utils {
return lastSlashpos;
}
public static void downloadFileFromUrl(String url, String file) throws IOException, JSONException {
public static void downloadFileFromUrl(String url, String file) throws IOException, JSONException, FileNotFoundException {
String temp = url.substring(0,LastSlash(url));
String temp2 = url.substring(LastSlash(url));
@ -301,11 +302,7 @@ public class utils {
return json;
}
/*public static void logToFile(String message, String filename) {
logToFile(message,filename,false);
}*/
/*public static void logToFile(String message, String filename, boolean outputToChatLog) {
public static void logToFile(String message, String filename) {
File file = new File(filename);
try {
@ -322,10 +319,7 @@ public class utils {
} catch (IOException e) {
e.printStackTrace();
}
if (outputToChatLog && sigIRC.chatlogmodule_enabled) {
ChatLogMessage.importMessages(message);
}
}*/
}
public static void writetoFile(String[] data, String filename) {
File file = new File(filename);

Loading…
Cancel
Save